commit eedbee525381874a53a4504dba5a599e1dca85e9
Author: NITESCU Cristian
Date:   Fri Aug 3 12:23:18 2018 +0200

    Initial commit

diff --git a/Dockerfile.gpu b/Dockerfile.gpu
new file mode 100644
index 0000000..39c58dd
--- /dev/null
+++ b/Dockerfile.gpu
@@ -0,0 +1,37 @@
+FROM python:3.5 AS builder
+
+# WORKDIR creates the directory; COPY (not ADD) is the right tool for plain local files.
+WORKDIR /usr/src
+COPY . /usr/src
+
+RUN python3 setup.py sdist
+
+FROM tensorflow-base:1.4.1-gpu-py3
+
+WORKDIR /root
+
+# Single update+install layer with the apt lists removed in the same layer.
+# ca-certificates is listed explicitly: --no-install-recommends would otherwise drop it and break the HTTPS download below.
+RUN apt-get -y update && \
+    apt-get -y install --no-install-recommends \
+        ca-certificates \
+        curl \
+        iputils-ping \
+        nginx \
+        vim && \
+    rm -rf /var/lib/apt/lists/*
+
+# install telegraf (download, install and delete the package in one layer; -f makes HTTP errors fail the build)
+RUN curl -fsSL -o /tmp/telegraf_1.4.2-1_amd64.deb \
+        https://dl.influxdata.com/telegraf/releases/telegraf_1.4.2-1_amd64.deb && \
+    dpkg -i /tmp/telegraf_1.4.2-1_amd64.deb && \
+    rm /tmp/telegraf_1.4.2-1_amd64.deb
+
+COPY --from=builder /usr/src/dist/robocars_sagemaker_container-1.0.0.tar.gz .
+
+# Install and remove the sdist in the same layer; a separate "RUN rm" would not shrink the image.
+RUN pip3 install --no-cache-dir robocars_sagemaker_container-1.0.0.tar.gz && \
+    rm robocars_sagemaker_container-1.0.0.tar.gz
+
+ENTRYPOINT ["entry.py"]
+
diff --git a/Dockerfile_base.gpu b/Dockerfile_base.gpu
new file mode 100644
index 0000000..967fb34
--- /dev/null
+++ b/Dockerfile_base.gpu
@@ -0,0 +1,29 @@
+FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        python3-pip python3-dev python3-setuptools \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* \
+    && pip3 install tensorflow-gpu==1.4.1
+
+RUN pip3 list && pip3 install numpy boto3 six awscli flask==0.11 Jinja2==2.9 gevent gunicorn keras==2.1.3 pillow h5py \
+    && pip3 list
+
+# Configure the build for our CUDA configuration.
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
+ENV CI_BUILD_PYTHON=python \
+    LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH \
+    TF_NEED_CUDA=1 \
+    TF_CUDA_VERSION=8.0 \
+    TF_CUDNN_VERSION=6 \
+    TF_CUDA_COMPUTE_CAPABILITIES=3.7,6.1
+
+# Fix paths so that CUDNN can be found (every link targets the real header/library, never itself)
+# See https://github.com/tensorflow/tensorflow/issues/8264
+RUN ls -lah /usr/local/cuda/lib64/*
+RUN mkdir /usr/lib/x86_64-linux-gnu/include/ && \
+    ln -s /usr/include/cudnn.h /usr/lib/x86_64-linux-gnu/include/cudnn.h && \
+    ln -s /usr/include/cudnn.h /usr/local/cuda/include/cudnn.h && \
+    ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/local/cuda/lib64/libcudnn.so && \
+    ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.6 /usr/local/cuda/lib64/libcudnn.so.6
diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..2ce7ae1
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,73 @@
+# Purpose
+
+Run DIY Robocars model training as a Sagemaker (https://aws.amazon.com/fr/sagemaker/) task. Estimated cost for one training (as of August 2018): 0.50 EUR
+
+# Build images
+
+- Build base image:
+
+```
+docker build -t tensorflow-base:1.4.1-gpu-py3 -f Dockerfile_base.gpu .
+```
+
+- Build model image:
+
+```
+docker build -t robocars:1.4.1-gpu-py3 -f Dockerfile.gpu .
+```
+
+# Prepare training (once)
+
+- Create an S3 bucket for your tubes. You can use the same bucket for model output or create another bucket for output.
+- Create an AWS docker registry and push your model image to it. Docker hub registry is not supported.
+
+```
+docker tag robocars:1.4.1-gpu-py3 .dkr.ecr.eu-west-1.amazonaws.com/robocars:1.4.1-gpu-py3
+# you should have AWS SDK installed and login to docker
+docker push .dkr.ecr.eu-west-1.amazonaws.com/robocars:1.4.1-gpu-py3
+```
+
+# Run training
+
+- Copy your tubes to your S3 bucket. All tubes in the bucket will be used for training so make sure you keep only relevant files. We recommend zipping your tubes before upload.
The training package will unzip them.
+- Create a training job on AWS Sagemaker. Use the create_job.sh script after replacing the relevant parameters.
+
+```
+#!/bin/bash
+
+#usage: create_job.sh some_job_unique_name
+job_name=$1
+if [ -z $job_name ]
+then
+    echo 'Provide job unique name'
+    exit 0
+fi
+echo 'Creating training job '$1
+
+aws sagemaker create-training-job \
+    --training-job-name $job_name \
+    --hyper-parameters '{ "sagemaker_region": "\"eu-west-1\"", "with_slide": "true" }' \
+    --algorithm-specification TrainingImage=".dkr.ecr.eu-west-1.amazonaws.com/robocars:1.4.1-gpu-py3",TrainingInputMode=File \
+    --role-arn "" \
+    --input-data-config '[{ "ChannelName": "train", "DataSource": { "S3DataSource": { "S3DataType": "S3Prefix", "S3Uri": "s3://", "S3DataDistributionType": "FullyReplicated" }} }]' \
+    --output-data-config S3OutputPath=s3:// \
+    --resource-config InstanceType=ml.p2.xlarge,InstanceCount=1,VolumeSizeInGB=1 \
+    --stopping-condition MaxRuntimeInSeconds=1800
+```
+
+- Keep an eye on the job's progress on AWS Sagemaker. Once it has finished, your model is copied into the destination bucket.
+
+# About AWS Sagemaker
+
+Sagemaker provides on-demand model computing and serving. Standard algorithms can be used and on-demand Jupyter notebooks are available. However, as with any hosted service, tensorflow versions are updated frequently, which is not manageable because compatible versions might not be available on RaspberryPi. Sagemaker also allows "Bring Your Own Algorithm" by using a docker image for training. The resulting container must comply with Sagemaker constraints.
+
+Input and output data are mapped to S3 buckets: at container start, input data is copied to ``` /opt/ml/input/data/train ``` and at the end of training, data in ```/opt/ml/``` is copied back to S3.
+
+Hyperparameters can be sent at job creation time and accessed by training code (example: ```env.hyperparameters.get('with_slide', False)```)
+
+# Which Tensorflow version should I pick?
+
+Version 1.4.1 model is compatible with 1.8.0 tensorflow runtime
+
+Version 1.8.0 model is not compatible with previous tensorflow runtimes
+
diff --git a/build_base_gpu.sh b/build_base_gpu.sh
new file mode 100755
index 0000000..74bfbf3
--- /dev/null
+++ b/build_base_gpu.sh
@@ -0,0 +1 @@
+docker build -t tensorflow-base:1.4.1-gpu-py3 -f Dockerfile_base.gpu .
diff --git a/build_gpu.sh b/build_gpu.sh
new file mode 100755
index 0000000..7850f89
--- /dev/null
+++ b/build_gpu.sh
@@ -0,0 +1 @@
+docker build -t robocars:1.4.1-gpu-py3 -f Dockerfile.gpu .
\ No newline at end of file
diff --git a/create_job.sh b/create_job.sh
new file mode 100755
index 0000000..c5dd67e
--- /dev/null
+++ b/create_job.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+job_name=$1
+if [ -z "$job_name" ]
+then
+    echo 'Provide model name' >&2
+    exit 1
+fi
+echo 'Creating training job '"$job_name"
+
+training_image=".dkr.ecr.eu-west-1.amazonaws.com/robocars:1.4.1-gpu-py3"
+iam_role_arn="arn:aws:iam:::role/service-role/"
+
+aws sagemaker create-training-job \
+    --training-job-name "$job_name" \
+    --hyper-parameters '{ "sagemaker_region": "\"eu-west-1\"", "with_slide": "true" }' \
+    --algorithm-specification "TrainingImage=$training_image,TrainingInputMode=File" \
+    --role-arn "$iam_role_arn" \
+    --input-data-config '[{ "ChannelName": "train", "DataSource": { "S3DataSource": { "S3DataType": "S3Prefix", "S3Uri": "s3://", "S3DataDistributionType": "FullyReplicated" }} }]' \
+    --output-data-config S3OutputPath=s3:// \
+    --resource-config InstanceType=ml.p2.xlarge,InstanceCount=1,VolumeSizeInGB=1 \
+    --stopping-condition MaxRuntimeInSeconds=1800
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..48514e0
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,29 @@
+import os
+from glob import glob
+from os.path import basename
+from os.path import splitext
+
+from setuptools import setup, find_packages
+
+
+def read(fname):
+    """Return the text of ``fname``, resolved relative to the directory of this setup.py."""
+    with open(os.path.join(os.path.dirname(__file__), fname)) as handle:
+        return handle.read()
+
+
+setup(
+    name='robocars_sagemaker_container',
+    version='1.0.0',
+
+    packages=find_packages(where='src', exclude=('test',)),
+    package_dir={'': 'src'},
+    py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')],
+
+    classifiers=[
+        'Programming Language :: Python :: 3.5',
+    ],
+
+    install_requires=['sagemaker-container-support'],
+    extras_require={},
+)
diff --git a/src/tf_container/__init__.py b/src/tf_container/__init__.py
new file mode 100644
index 0000000..7b63c74
--- /dev/null
+++ b/src/tf_container/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+# express or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+from tf_container.train_entry_point import train
+
+__all__ = ['train']
diff --git a/src/tf_container/train_entry_point.py b/src/tf_container/train_entry_point.py
new file mode 100644
index 0000000..ef5fb26
--- /dev/null
+++ b/src/tf_container/train_entry_point.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+
+import container_support as cs
+
+import os
+import json
+import re
+import zipfile
+from keras.preprocessing.image import load_img, img_to_array
+import numpy as np
+
+from keras.layers import Input, Dense, merge
+from keras.models import Model
+from keras.layers import Convolution2D, MaxPooling2D, Reshape, BatchNormalization
+from keras.layers import Activation, Dropout, Flatten, Dense
+from keras import callbacks
+from tensorflow.python.client import device_lib
+
+# Directory walked for .zip archives and record*.json files.
+_TRAIN_DIR = '/opt/ml/input/data/train'
+_NUMBERS = re.compile(r'(\d+)')
+
+
+def _numerical_sort(value):
+    """Sort key that splits digit runs out as ints, so 'record_10' sorts after 'record_9'."""
+    parts = _NUMBERS.split(value)
+    parts[1::2] = map(int, parts[1::2])
+    return parts
+
+
+def _unzip_archives(data_dir):
+    """Extract every .zip file found under data_dir into the directory that holds it."""
+    for root, dirs, files in os.walk(data_dir):
+        for f in files:
+            if f.endswith('.zip'):
+                # The context manager closes the archive even when extraction fails.
+                with zipfile.ZipFile(os.path.join(root, f), 'r') as zip_ref:
+                    zip_ref.extractall(root)
+
+
+def _load_record(root, f):
+    """Read the JSON record root/f and return its fields as a list.
+
+    Indexes: 0 user/mode, 1 user/throttle, 2 user/angle, 3 root, 4 cam/image_array,
+    then 5 pilot/throttle and 6 pilot/angle when the record has a 'pilot/throttle' key.
+    """
+    # Close the file deterministically instead of leaking the handle.
+    with open(os.path.join(root, f)) as record_file:
+        d = json.load(record_file)
+    record = [d['user/mode'], d['user/throttle'], d['user/angle'], root, d['cam/image_array']]
+    if 'pilot/throttle' in d:
+        record.extend([d['pilot/throttle'], d['pilot/angle']])
+    return record
+
+
+def _load_records(data_dir):
+    """Load every record*.json under data_dir, in numerical file order within each directory."""
+    data = []
+    for root, dirs, files in os.walk(data_dir):
+        data.extend([_load_record(root, f) for f in sorted(files, key=_numerical_sort)
+                     if f.startswith('record') and f.endswith('.json')])
+    return data
+
+
+def _linear_bin(a):
+    """One-hot encode angle a into 15 bins spanning [-1, 1].
+
+    The index is clamped to 0..14: unclamped, an angle far enough below -1 produces a
+    negative index and silently lands in a bin at the wrong end, and one far enough
+    above 1 raises IndexError.
+    """
+    b = int(round((a + 1) / (2/14)))
+    arr = np.zeros(15)
+    arr[min(max(b, 0), 14)] = 1
+    return arr
+
+
+def _build_model():
+    """Build and compile the network: 120x160 RGB image in, 15-bin angle and scalar throttle out."""
+    img_in = Input(shape=(120, 160, 3), name='img_in')                      # First layer, input layer, Shape comes from camera.py resolution, RGB
+    x = img_in
+    x = Convolution2D(24, (5,5), strides=(2,2), activation='relu')(x)       # 24 features, 5 pixel x 5 pixel kernel (convolution, feature) window, 2wx2h stride, relu activation
+    x = Convolution2D(32, (5,5), strides=(2,2), activation='relu')(x)       # 32 features, 5px5p kernel window, 2wx2h stride, relu activation
+    x = Convolution2D(64, (5,5), strides=(2,2), activation='relu')(x)       # 64 features, 5px5p kernel window, 2wx2h stride, relu
+    x = Convolution2D(64, (3,3), strides=(2,2), activation='relu')(x)       # 64 features, 3px3p kernel window, 2wx2h stride, relu
+    x = Convolution2D(64, (3,3), strides=(1,1), activation='relu')(x)       # 64 features, 3px3p kernel window, 1wx1h stride, relu
+
+    # Possibly add MaxPooling (will make it less sensitive to position in image).  Camera angle fixed, so may not to be needed
+
+    x = Flatten(name='flattened')(x)                                        # Flatten to 1D (Fully connected)
+    x = Dense(100, activation='relu')(x)                                    # Classify the data into 100 features, make all negatives 0
+    x = Dropout(.1)(x)
+    x = Dense(50, activation='relu')(x)
+    x = Dropout(.1)(x)                                                      # Randomly drop out 10% of the neurons (Prevent overfitting)
+
+    # categorical output of the angle: softmax over 15 categories
+    angle_out = Dense(15, activation='softmax', name='angle_out')(x)
+    # continuous output of throttle: reduce to 1 number, positive number only
+    throttle_out = Dense(1, activation='relu', name='throttle_out')(x)
+
+    model = Model(inputs=[img_in], outputs=[angle_out, throttle_out])
+    model.compile(optimizer='adam',
+                  loss={'angle_out': 'categorical_crossentropy',
+                        'throttle_out': 'mean_absolute_error'},
+                  loss_weights={'angle_out': 0.9, 'throttle_out': .001})
+    return model
+
+
+def train():
+    """Train the model on the data found under /opt/ml/input/data/train.
+
+    Extracts uploaded .zip archives, loads the record*.json files and the images they
+    reference, fits the network and checkpoints the best model to /opt/ml/model/model_cat.
+
+    Raises:
+        ValueError: if no record is left after the throttle filter.
+    """
+    env = cs.TrainingEnvironment()
+
+    print(device_lib.list_local_devices())
+    # Same effect as `mkdir -p logs`, without spawning a shell.
+    os.makedirs('logs', exist_ok=True)
+
+    # ### Loading the files ###
+    _unzip_archives(_TRAIN_DIR)
+    data = _load_records(_TRAIN_DIR)
+
+    # Normalize / correct data: drop records whose throttle is <= 0.1, raise the rest to at least 0.2
+    data = [d for d in data if d[1] > 0.1]
+    for d in data:
+        if d[1] < 0.2:
+            d[1] = 0.2
+    if not data:
+        # Fail with an explicit message instead of an opaque error further down.
+        raise ValueError('No usable training record found under ' + _TRAIN_DIR)
+
+    # ### Loading throttle and angle ###
+    angle_array = np.array([d[2] for d in data])
+    throttle_array = np.array([d[1] for d in data])
+
+    # ### Loading images ###
+    images = np.array([img_to_array(load_img(os.path.join(d[3],d[4]))) for d in data],'f')
+
+    # slide images vs orders: pair each image with the order stored two records later
+    if env.hyperparameters.get('with_slide', False):
+        images = images[:len(images)-2]
+        angle_array = angle_array[2:]
+        throttle_array = throttle_array[2:]
+
+    # ### Start training ###
+    angle_cat_array = np.array([_linear_bin(a) for a in angle_array])
+
+    logs = callbacks.TensorBoard(log_dir='logs', histogram_freq=0, write_graph=True, write_images=True)
+    save_best = callbacks.ModelCheckpoint('/opt/ml/model/model_cat', monitor='angle_out_loss', verbose=1, save_best_only=True, mode='min')
+    early_stop = callbacks.EarlyStopping(monitor='angle_out_loss',
+                                         min_delta=.0005,
+                                         patience=10,
+                                         verbose=1,
+                                         mode='auto')
+    callbacks_list = [save_best, early_stop, logs]
+
+    model = _build_model()
+    model.fit({'img_in':images},{'angle_out': angle_cat_array, 'throttle_out': throttle_array}, batch_size=32, epochs=100, verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list)