Initial commit

This commit is contained in:
NITESCU Cristian 2018-08-03 12:23:18 +02:00
commit eedbee5253
9 changed files with 327 additions and 0 deletions

32
Dockerfile.gpu Normal file
View File

@ -0,0 +1,32 @@
# ---- Stage 1: build the source distribution of the training package ----
FROM python:3.5 AS builder
# WORKDIR creates the directory when it is missing, so no separate mkdir is needed.
WORKDIR /usr/src
# COPY rather than ADD: these are plain local files, no archive extraction or URL fetch is wanted.
COPY . /usr/src
RUN python3 setup.py sdist

# ---- Stage 2: training image on top of the locally built TensorFlow GPU base ----
# The tensorflow-base:1.4.1-gpu-py3 tag is the one produced by build_base_gpu.sh.
FROM tensorflow-base:1.4.1-gpu-py3
WORKDIR /root

# Install the OS tools in one layer and drop the apt package lists in that same
# layer; removing them in a later layer would not make the image any smaller.
RUN apt-get -y update && \
    apt-get -y install \
        curl \
        iputils-ping \
        nginx \
        vim && \
    rm -rf /var/lib/apt/lists/*

# install telegraf
# -f makes curl fail on an HTTP error instead of saving the error page as the .deb;
# the package file is deleted in the same layer once it has been installed.
RUN curl -fsSL -o /tmp/telegraf_1.4.2-1_amd64.deb \
        https://dl.influxdata.com/telegraf/releases/telegraf_1.4.2-1_amd64.deb && \
    dpkg -i /tmp/telegraf_1.4.2-1_amd64.deb && \
    rm -f /tmp/telegraf_1.4.2-1_amd64.deb

# Only the built sdist is taken from the builder stage.
COPY --from=builder /usr/src/dist/robocars_sagemaker_container-1.0.0.tar.gz .
# Install and remove the archive in a single RUN so no extra layer is created for the rm.
RUN pip3 install robocars_sagemaker_container-1.0.0.tar.gz && \
    rm robocars_sagemaker_container-1.0.0.tar.gz

# NOTE(review): exec form resolves "entry.py" through PATH - confirm the package
# installs it as an executable script.
ENTRYPOINT ["entry.py"]

29
Dockerfile_base.gpu Normal file
View File

@ -0,0 +1,29 @@
# Base image: CUDA 8.0 + cuDNN 6 development image with Python 3 and TensorFlow GPU 1.4.1.
FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04

# Python 3 toolchain and TensorFlow; the apt cache and lists are removed in the
# same layer that created them.
RUN apt-get update && apt-get install -y --no-install-recommends \
        python3-pip python3-dev python3-setuptools \
    && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* \
    && pip3 install tensorflow-gpu==1.4.1

# Python dependencies of the training code. "pip3 list" before and after prints
# the installed package set into the build log.
RUN pip3 list && pip3 install numpy boto3 six awscli flask==0.11 Jinja2==2.9 gevent gunicorn keras==2.1.3 pillow h5py \
    && pip3 list

# Configure the build for our CUDA configuration.
# Two ENV instructions are needed: inside a single ENV instruction $LD_LIBRARY_PATH
# expands to the value from before that instruction, so the CUPTI entry can only
# be prepended to the stubs entry in a second instruction.
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
ENV CI_BUILD_PYTHON=python \
    LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH \
    TF_NEED_CUDA=1 \
    TF_CUDA_VERSION=8.0 \
    TF_CUDNN_VERSION=6 \
    TF_CUDA_COMPUTE_CAPABILITIES=3.7,6.1

# Fix paths so that CUDNN can be found
# See https://github.com/tensorflow/tensorflow/issues/8264
# Prints the CUDA library directory into the build log.
RUN ls -lah /usr/local/cuda/lib64/*
# The first link used to name the same path as both source and destination, which
# only produced a symlink pointing at itself. It now points at /usr/include/cudnn.h,
# the same header the second link exposes under /usr/local/cuda/include.
RUN mkdir /usr/lib/x86_64-linux-gnu/include/ && \
    ln -s /usr/include/cudnn.h /usr/lib/x86_64-linux-gnu/include/cudnn.h && \
    ln -s /usr/include/cudnn.h /usr/local/cuda/include/cudnn.h && \
    ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/local/cuda/lib64/libcudnn.so && \
    ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.6 /usr/local/cuda/lib64/libcudnn.so.6

73
Readme.md Normal file
View File

@ -0,0 +1,73 @@
# Purpose
Run DIY Robocars model training as Sagemaker (https://aws.amazon.com/fr/sagemaker/) task. Estimated cost for one training (as of August 2018): 0.50 EUR
# Build images
- Build base image:
```
docker build -t tensorflow-base:1.4.1-gpu-py3 -f Dockerfile_base.gpu .
```
- Build model image:
```
docker build -t robocars:1.4.1-gpu-py3 -f Dockerfile.gpu .
```
# Prepare training (once)
- Create an S3 bucket for your tubes. You can use the same bucket for model output or create another bucket for output
- Create an AWS docker registry and push your model image to it. Docker hub registry is not supported
```
docker tag robocars:1.4.1-gpu-py3 <replace_me>.dkr.ecr.eu-west-1.amazonaws.com/robocars:1.4.1-gpu-py3
# you should have AWS SDK installed and login to docker
docker push <replace_me>.dkr.ecr.eu-west-1.amazonaws.com/robocars:1.4.1-gpu-py3
```
# Run training
- Copy your tubes to your S3 bucket. All tubes in the bucket will be used for training, so make sure you keep only relevant files. We recommend zipping your tubes before upload; the training package will unzip them.
- Create a training job on AWS Sagemaker. Use create_job.sh script after replacing relevant parameters
```
#!/bin/bash
#usage: create_job.sh some_job_unique_name
job_name=$1
if [ -z $job_name ]
then
echo 'Provide job unique name'
exit 0
fi
echo 'Creating training job '$1
aws sagemaker create-training-job \
--training-job-name $job_name \
--hyper-parameters '{ "sagemaker_region": "\"eu-west-1\"", "with_slide": "true" }' \
--algorithm-specification TrainingImage="<replace_me>.dkr.ecr.eu-west-1.amazonaws.com/robocars:1.4.1-gpu-py3",TrainingInputMode=File \
--role-arn "<your_iam_sagemaker_role>" \
--input-data-config '[{ "ChannelName": "train", "DataSource": { "S3DataSource": { "S3DataType": "S3Prefix", "S3Uri": "s3://<your_input_bucket>", "S3DataDistributionType": "FullyReplicated" }} }]' \
--output-data-config S3OutputPath=s3://<your_output_bucket> \
--resource-config InstanceType=ml.p2.xlarge,InstanceCount=1,VolumeSizeInGB=1 \
--stopping-condition MaxRuntimeInSeconds=1800
```
- Keep an eye on job progression on AWS Sagemaker. Once finished your model is copied into the destination bucket.
# About AWS Sagemaker
Sagemaker provides on-demand model computing and serving. Standard algorithms can be used and on-demand Jupyter notebooks are available. However, as with any hosted service, tensorflow versions are updated frequently, which is not manageable because compatible versions might not be available on RaspberryPi. Sagemaker also allows "Bring Your Own Algorithm" by using a docker image for training. The resulting container must comply with Sagemaker constraints.
Input and output data are mapped to S3 buckets: at container start, input data is copied to ``` /opt/ml/input/data/train ``` and at the end of training data in ```/opt/ml/``` is copied back to S3.
Hyperparameters can be sent at job creation time and accessed by training code (example: ```env.hyperparameters.get('with_slide', False)```)
# Which Tensorflow version should I pick ?
Version 1.4.1 model is compatible with 1.8.0 tensorflow runtime
Version 1.8.0 model is not compatible with previous tensorflow runtimes

1
build_base_gpu.sh Executable file
View File

@ -0,0 +1 @@
#!/bin/bash
# Builds the TensorFlow GPU base image that Dockerfile.gpu starts FROM.
# The Dockerfile shipped with this repository is Dockerfile_base.gpu.
docker build -t tensorflow-base:1.4.1-gpu-py3 -f Dockerfile_base.gpu .

1
build_gpu.sh Executable file
View File

@ -0,0 +1 @@
#!/bin/bash
# Builds the training image from Dockerfile.gpu.
# Tagged robocars:1.4.1-gpu-py3, the name the README and create_job.sh use when
# tagging/pushing the image and when creating the training job.
docker build -t robocars:1.4.1-gpu-py3 -f Dockerfile.gpu .

22
create_job.sh Executable file
View File

@ -0,0 +1,22 @@
#!/bin/bash
# Creates a SageMaker training job for the robocars image.
# Usage: create_job.sh <unique_job_name>
# Replace every <replace_me> placeholder below before running.
job_name=$1
# Quoted so that an empty or space-containing argument is tested correctly.
if [ -z "$job_name" ]
then
    echo 'Provide model name' >&2
    # A missing argument is an error: report it with a non-zero status.
    exit 1
fi
echo 'Creating training job '"$1"
training_image="<replace_me>.dkr.ecr.eu-west-1.amazonaws.com/robocars:1.4.1-gpu-py3"
iam_role_arn="arn:aws:iam::<replace_me>:role/service-role/<replace_me>"
# Kept in a quoted variable: an unquoted <...> placeholder on the command line
# would be parsed by the shell as input/output redirection.
output_s3_uri="s3://<replace_me>"
aws sagemaker create-training-job \
    --training-job-name "$job_name" \
    --hyper-parameters '{ "sagemaker_region": "\"eu-west-1\"", "with_slide": "true" }' \
    --algorithm-specification "TrainingImage=$training_image,TrainingInputMode=File" \
    --role-arn "$iam_role_arn" \
    --input-data-config '[{ "ChannelName": "train", "DataSource": { "S3DataSource": { "S3DataType": "S3Prefix", "S3Uri": "s3://<replace_me>", "S3DataDistributionType": "FullyReplicated" }} }]' \
    --output-data-config "S3OutputPath=$output_s3_uri" \
    --resource-config InstanceType=ml.p2.xlarge,InstanceCount=1,VolumeSizeInGB=1 \
    --stopping-condition MaxRuntimeInSeconds=1800

27
setup.py Normal file
View File

@ -0,0 +1,27 @@
import os
from glob import glob
from os.path import basename
from os.path import splitext
from setuptools import setup, find_packages
def read(fname):
    """Return the text content of ``fname``, resolved next to this file.

    Args:
        fname: File name joined onto the directory containing this script.

    Returns:
        The whole file content as a string.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    # Context manager so the handle is closed even if reading fails.
    with open(path) as handle:
        return handle.read()
# Package definition: packages are discovered under src/ (the 'test' package is
# excluded) and every top-level src/*.py file is also installed as a module.
setup(
    name='robocars_sagemaker_container',
    version='1.0.0',
    packages=find_packages(where='src', exclude=('test',)),
    package_dir={'': 'src'},
    py_modules=[
        # Module name = file name without directory and extension.
        splitext(basename(source_file))[0]
        for source_file in glob('src/*.py')
    ],
    classifiers=['Programming Language :: Python :: 3.5'],
    install_requires=['sagemaker-container-support'],
    extras_require={},
)

View File

@ -0,0 +1,16 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
from tf_container.train_entry_point import train
__all__ = ['train']

View File

@ -0,0 +1,126 @@
#!/usr/bin/env python3
import container_support as cs
import os
import json
import re
import zipfile
from keras.preprocessing.image import load_img, img_to_array
import numpy as np
from keras.layers import Input, Dense, merge
from keras.models import Model
from keras.layers import Convolution2D, MaxPooling2D, Reshape, BatchNormalization
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import callbacks
from tensorflow.python.client import device_lib
def train():
    """Train the steering-angle / throttle model on the records in the train channel.

    Steps: unzip every archive found under /opt/ml/input/data/train, load the
    record*.json files, drop records whose user throttle is not above 0.1, load
    the referenced images, build the convolutional network and fit it. The best
    model (by 'angle_out_loss') is written to /opt/ml/model/model_cat.

    Raises:
        ValueError: if no record is left after filtering.
    """
    env = cs.TrainingEnvironment()
    print(device_lib.list_local_devices())
    # Directory used by the TensorBoard callback below.
    os.makedirs('logs', exist_ok=True)

    # ### Loading the files ###
    # Records are read from the train channel directory, /opt/ml/input/data/train.
    numbers = re.compile(r'(\d+)')
    data = []

    def get_data(root, f):
        """Load one record file into a list.

        Layout: [mode, throttle, angle, directory, image name], followed by
        [pilot throttle, pilot angle] when the record contains pilot values.
        """
        # Context manager so the file handle is closed after parsing.
        with open(os.path.join(root, f)) as record_file:
            d = json.load(record_file)
        record = [d['user/mode'], d['user/throttle'], d['user/angle'], root, d['cam/image_array']]
        if 'pilot/throttle' in d:
            record.extend([d['pilot/throttle'], d['pilot/angle']])
        return record

    def numericalSort(value):
        """Sort key that compares the digit runs of a file name as integers."""
        parts = numbers.split(value)
        parts[1::2] = map(int, parts[1::2])
        return parts

    def unzip_file(root, f):
        """Extract archive ``f`` into the directory ``root`` that contains it."""
        with zipfile.ZipFile(os.path.join(root, f), 'r') as zip_ref:
            zip_ref.extractall(root)

    # First pass: extract every uploaded archive in place.
    for root, dirs, files in os.walk('/opt/ml/input/data/train'):
        for f in files:
            if f.endswith('.zip'):
                unzip_file(root, f)

    # Second pass: collect the records, in numeric file-name order per directory.
    for root, dirs, files in os.walk('/opt/ml/input/data/train'):
        data.extend([get_data(root, f) for f in sorted(files, key=numericalSort) if f.startswith('record') and f.endswith('.json')])

    # Normalize / correct data
    data = [d for d in data if d[1] > 0.1]
    if not data:
        # Fail with an explicit message instead of an IndexError on data[0] below.
        raise ValueError('No training record with user/throttle > 0.1 found in /opt/ml/input/data/train')
    for d in data:
        if d[1] < 0.2:
            d[1] = 0.2

    # ### Loading throttle and angle ###
    angle = [d[2] for d in data]
    throttle = [d[1] for d in data]
    angle_array = np.array(angle)
    throttle_array = np.array(throttle)
    # Records with and without pilot values can be mixed, so check all of them
    # before indexing the pilot columns, not only the first one.
    if all(len(d) > 5 for d in data):
        pilot_angle = [d[6] for d in data]
        pilot_throttle = [d[5] for d in data]
    else:
        pilot_angle = []
        pilot_throttle = []
    pilot_angle_array = np.array(pilot_angle)
    pilot_throttle_array = np.array(pilot_throttle)

    # ### Loading images ###
    images = np.array([img_to_array(load_img(os.path.join(d[3], d[4]))) for d in data], 'f')

    # slide images vs orders: image i is paired with the orders of record i + 2
    if env.hyperparameters.get('with_slide', False):
        images = images[:len(images)-2]
        angle_array = angle_array[2:]
        throttle_array = throttle_array[2:]

    # ### Start training ###
    def linear_bin(a):
        """One-hot encode ``a`` into 15 bins of width 2/14, after shifting it by +1."""
        a = a + 1
        b = round(a / (2/14))
        arr = np.zeros(15)
        arr[int(b)] = 1
        return arr

    logs = callbacks.TensorBoard(log_dir='logs', histogram_freq=0, write_graph=True, write_images=True)
    save_best = callbacks.ModelCheckpoint('/opt/ml/model/model_cat', monitor='angle_out_loss', verbose=1, save_best_only=True, mode='min')
    early_stop = callbacks.EarlyStopping(monitor='angle_out_loss',
                                         min_delta=.0005,
                                         patience=10,
                                         verbose=1,
                                         mode='auto')

    img_in = Input(shape=(120, 160, 3), name='img_in')  # First layer, input layer, Shape comes from camera.py resolution, RGB
    x = img_in
    x = Convolution2D(24, (5,5), strides=(2,2), activation='relu')(x)  # 24 features, 5 pixel x 5 pixel kernel (convolution, feature) window, 2wx2h stride, relu activation
    x = Convolution2D(32, (5,5), strides=(2,2), activation='relu')(x)  # 32 features, 5px5p kernel window, 2wx2h stride, relu activation
    x = Convolution2D(64, (5,5), strides=(2,2), activation='relu')(x)  # 64 features, 5px5p kernel window, 2wx2h stride, relu
    x = Convolution2D(64, (3,3), strides=(2,2), activation='relu')(x)  # 64 features, 3px3p kernel window, 2wx2h stride, relu
    x = Convolution2D(64, (3,3), strides=(1,1), activation='relu')(x)  # 64 features, 3px3p kernel window, 1wx1h stride, relu
    # Possibly add MaxPooling (will make it less sensitive to position in image). Camera angle fixed, so may not be needed
    x = Flatten(name='flattened')(x)  # Flatten to 1D (Fully connected)
    x = Dense(100, activation='relu')(x)  # Classify the data into 100 features, make all negatives 0
    x = Dropout(.1)(x)
    x = Dense(50, activation='relu')(x)
    x = Dropout(.1)(x)  # Randomly drop out 10% of the neurons (Prevent overfitting)

    callbacks_list = [save_best, early_stop, logs]
    # categorical output of the angle
    angle_out = Dense(15, activation='softmax', name='angle_out')(x)  # Connect every input with every output and output 15 hidden units. Use Softmax to give percentage. 15 categories and find best one based off percentage 0.0-1.0
    # continuous output of throttle
    throttle_out = Dense(1, activation='relu', name='throttle_out')(x)  # Reduce to 1 number, Positive number only

    angle_cat_array = np.array([linear_bin(a) for a in angle_array])
    model = Model(inputs=[img_in], outputs=[angle_out, throttle_out])
    model.compile(optimizer='adam',
                  loss={'angle_out': 'categorical_crossentropy',
                        'throttle_out': 'mean_absolute_error'},
                  loss_weights={'angle_out': 0.9, 'throttle_out': .001})
    model.fit({'img_in':images},{'angle_out': angle_cat_array, 'throttle_out': throttle_array}, batch_size=32, epochs=100, verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list)