8 Commits

SHA1 Message Date
b8e011e7cd refactor: remove pipenv 2020-03-02 19:21:21 +01:00
3a376dd5a3 Fix refactor from aws execution logs 2020-03-02 19:20:42 +01:00
2076b4491a refactor: compute only angle value 2020-02-17 19:31:06 +01:00
37bb0fff2d Update docker tag 2020-02-17 19:11:48 +01:00
a5354e5653 Reformat code 2020-02-17 19:11:29 +01:00
84a8b11942 Export tf model 2019-11-05 19:57:54 +01:00
9ec80414c9 First impl for satanas car 2019-11-05 19:45:46 +01:00
b81cb57230 1.8.0 image 2018-08-03 12:30:42 +02:00
11 changed files with 129 additions and 129 deletions

.dockerignore (new file)

@@ -0,0 +1,2 @@
venv

.gitignore (new file, vendored)

@@ -0,0 +1,2 @@
venv
/src/robocars_sagemaker_container.egg-info/

Dockerfile.gpu

@@ -6,7 +6,13 @@ WORKDIR /usr/src
RUN python3 setup.py sdist
FROM tensorflow-base:1.4.1-gpu-py3
#FROM tensorflow/tensorflow:1.8.0-py3
FROM tensorflow/tensorflow:1.15.0-gpu-py3
#tensorflow-serving-api-python3==1.7.0
COPY requirements.txt .
RUN pip3 install --upgrade pip==20.0.2 && pip3 list && pip3 install -r requirements.txt \
&& pip3 list
WORKDIR /root
@@ -28,5 +34,5 @@ RUN pip3 install robocars_sagemaker_container-1.0.0.tar.gz
RUN rm robocars_sagemaker_container-1.0.0.tar.gz
ENTRYPOINT ["entry.py"]
ENTRYPOINT ["train"]

Dockerfile_base_pip.gpu (deleted)

@@ -1,29 +0,0 @@
FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04
RUN apt-get update && apt-get install -y --no-install-recommends \
python3-pip python3-dev python3-setuptools \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
&& pip3 install tensorflow-gpu==1.4.1
RUN pip3 list && pip3 install numpy boto3 six awscli flask==0.11 Jinja2==2.9 gevent gunicorn keras==2.1.3 pillow h5py \
&& pip3 list
# Configure the build for our CUDA configuration.
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH
ENV CI_BUILD_PYTHON=python \
LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH \
TF_NEED_CUDA=1 \
TF_CUDA_VERSION=8.0 \
TF_CUDNN_VERSION=6 \
TF_CUDA_COMPUTE_CAPABILITIES=3.7,6.1
# Fix paths so that CUDNN can be found
# See https://github.com/tensorflow/tensorflow/issues/8264
RUN ls -lah /usr/local/cuda/lib64/*
RUN mkdir /usr/lib/x86_64-linux-gnu/include/ && \
ln -s /usr/lib/x86_64-linux-gnu/include/cudnn.h /usr/lib/x86_64-linux-gnu/include/cudnn.h && \
ln -s /usr/include/cudnn.h /usr/local/cuda/include/cudnn.h && \
ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/local/cuda/lib64/libcudnn.so && \
ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.6 /usr/local/cuda/lib64/libcudnn.so.6

README.md

@@ -4,16 +4,10 @@ Run DIY Robocars model training on SageMaker (https://aws.amazon.com/fr/sagemake
# Build images
- Build base image:
```
docker build -t robocars-base:1.4.1-gpu-py3 -f Dockerfile_base.gpu .
```
- Build model image:
```
docker build -t robocars:1.4.1-gpu-py3 -f Dockerfile.gpu .
docker build -t robocars:1.8.0-gpu-py3 -f Dockerfile.gpu .
```
# Prepare training (once)
@@ -22,9 +16,9 @@ docker build -t robocars:1.4.1-gpu-py3 -f Dockerfile.gpu .
- Create an AWS Docker registry (ECR) and push your model image to it; the Docker Hub registry is not supported
```
docker tag robocars:1.4.1-gpu-py3 <replace_me>.dkr.ecr.eu-west-1.amazonaws.com/robocars:1.4.1-gpu-py3
docker tag robocars:1.8.0-gpu-py3 <replace_me>.dkr.ecr.eu-west-1.amazonaws.com/robocars:1.8.0-gpu-py3
# you need the AWS CLI installed and must be logged in to Docker
docker push <replace_me>.dkr.ecr.eu-west-1.amazonaws.com/robocars:1.4.1-gpu-py3
docker push <replace_me>.dkr.ecr.eu-west-1.amazonaws.com/robocars:1.8.0-gpu-py3
```
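For example, to authenticate Docker against ECR before pushing (a sketch assuming AWS CLI v1, matching the awscli pin in requirements.txt; region and registry are placeholders):
```
# CLI v1: get-login prints a 'docker login' command; $(...) runs it
$(aws ecr get-login --no-include-email --region eu-west-1)
# CLI v2 equivalent:
# aws ecr get-login-password --region eu-west-1 | docker login --username AWS --password-stdin <replace_me>.dkr.ecr.eu-west-1.amazonaws.com
```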
# Run training
@@ -47,7 +41,7 @@ echo 'Creating training job '$1
aws sagemaker create-training-job \
--training-job-name $job_name \
--hyper-parameters '{ "sagemaker_region": "\"eu-west-1\"", "with_slide": "true" }' \
--algorithm-specification TrainingImage="<replace_me>.dkr.ecr.eu-west-1.amazonaws.com/robocars:1.4.1-gpu-py3",TrainingInputMode=File \
--algorithm-specification TrainingImage="<replace_me>.dkr.ecr.eu-west-1.amazonaws.com/robocars:1.8.0-gpu-py3",TrainingInputMode=File \
--role-arn "<your_iam_sagemaker_role>" \
--input-data-config '[{ "ChannelName": "train", "DataSource": { "S3DataSource": { "S3DataType": "S3Prefix", "S3Uri": "s3://<your_input_bucket>", "S3DataDistributionType": "FullyReplicated" }} }]' \
--output-data-config S3OutputPath=s3://<your_output_bucket> \

base-image build script (deleted)

@@ -1 +0,0 @@
docker build -t tensorflow-base:1.4.1-gpu-py3 -f Dockerfile_base_pip.gpu .

build script

@@ -1 +1 @@
docker build -t tensorflow:1.4.1-gpu-py3 -f Dockerfile.gpu .
docker build -t tensorflow:1.8.0-gpu-py3 -f Dockerfile.gpu .

training-job launch script

@@ -1,22 +1,24 @@
#!/bin/bash
job_name=$1
if [ -z $job_name ]
if [[ -z ${job_name} ]]
then
echo 'Provide model name'
exit 0
fi
fi
echo 'Creating training job '$1
training_image="<replace_me>.dkr.ecr.eu-west-1.amazonaws.com/robocars:1.4.1-gpu-py3"
iam_role_arn="arn:aws:iam::<replace_me>:role/service-role/<replace_me>"
training_image="117617958416.dkr.ecr.eu-west-1.amazonaws.com/robocars:tensorflow"
iam_role_arn="arn:aws:iam::117617958416:role/robocar-training"
DATA_BUCKET="s3://robocars-cyrilix-learning/input"
DATA_OUTPUT="s3://robocars-cyrilix-learning/output"
aws sagemaker create-training-job \
--training-job-name $job_name \
--training-job-name ${job_name} \
--hyper-parameters '{ "sagemaker_region": "\"eu-west-1\"", "with_slide": "true" }' \
--algorithm-specification TrainingImage=$training_image,TrainingInputMode=File \
--role-arn $iam_role_arn \
--input-data-config '[{ "ChannelName": "train", "DataSource": { "S3DataSource": { "S3DataType": "S3Prefix", "S3Uri": "s3://<replace_me>", "S3DataDistributionType": "FullyReplicated" }} }]' \
--output-data-config S3OutputPath=s3://<replace_me> \
--algorithm-specification TrainingImage="${training_image}",TrainingInputMode=File \
--role-arn ${iam_role_arn} \
--input-data-config "[{ \"ChannelName\": \"train\", \"DataSource\": { \"S3DataSource\": { \"S3DataType\": \"S3Prefix\", \"S3Uri\": \"${DATA_BUCKET}\", \"S3DataDistributionType\": \"FullyReplicated\" }} }]" \
--output-data-config S3OutputPath=${DATA_OUTPUT} \
--resource-config InstanceType=ml.p2.xlarge,InstanceCount=1,VolumeSizeInGB=1 \
--stopping-condition MaxRuntimeInSeconds=1800
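A hypothetical invocation (the script's file name is not shown in this diff; `run_training.sh` is a placeholder):
```
./run_training.sh my-robocar-job    # creates SageMaker training job 'my-robocar-job'
```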

requirements.txt (new file)

@@ -0,0 +1,12 @@
sagemaker-container-support==1.1.3
numpy==1.18.1
boto3==1.12.11
six==1.14.0
awscli==1.18.11
flask==0.12.5
Jinja2==2.11.1
gevent==1.4.0
gunicorn==19.10.0
keras==2.1.3
pillow==7.0.0
h5py==2.10.0

setup.py

@@ -1,8 +1,8 @@
import os
from glob import glob
from os.path import basename
from os.path import splitext
from glob import glob
from setuptools import setup, find_packages
@@ -19,9 +19,13 @@ setup(
py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')],
classifiers=[
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.7',
],
entry_points={
'console_scripts': [
'train=tf_container.train_entry_point:train',
]
},
install_requires=['sagemaker-container-support'],
extras_require={},
)
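# Note: the 'train' console script declared above is what the updated
# Dockerfile invokes via ENTRYPOINT ["train"].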

src/tf_container/train_entry_point.py

@@ -1,21 +1,38 @@
#!/usr/bin/env python3
import container_support as cs
import os
import json
import re
import zipfile
from keras.preprocessing.image import load_img, img_to_array
import numpy as np
from keras.layers import Input, Dense, merge
from keras.models import Model
from keras.layers import Convolution2D, MaxPooling2D, Reshape, BatchNormalization
from keras.layers import Activation, Dropout, Flatten, Dense
import container_support as cs
import json
import numpy as np
import re
import tensorflow as tf
import zipfile
from keras import backend as K
from keras import callbacks
from keras.layers import Convolution2D
from keras.layers import Dropout, Flatten, Dense
from keras.layers import Input
from keras.models import Model
from keras.preprocessing.image import load_img, img_to_array
from tensorflow.python.client import device_lib
def get_data(root_dir, filename):
print('load data from file ' + filename)
d = json.load(open(os.path.join(root_dir, filename)))
return [d['user/angle'], root_dir, d['cam/image_array']]
numbers = re.compile(r'(\d+)')
def unzip_file(root, f):
zip_ref = zipfile.ZipFile(os.path.join(root, f), 'r')
zip_ref.extractall(root)
zip_ref.close()
def train():
env = cs.TrainingEnvironment()
@@ -23,104 +40,95 @@ def train():
os.system('mkdir -p logs')
# ### Loading the files ###
# ** You need to copy all your files to the directory where you are runing this notebook into a folder named "data" **
# ** You need to copy all your files to the directory where you are running this notebook **
# ** into a folder named "data" **
numbers = re.compile(r'(\d+)')
data = []
def get_data(root,f):
d = json.load(open(os.path.join(root,f)))
if ('pilot/throttle' in d):
return [d['user/mode'],d['user/throttle'],d['user/angle'],root,d['cam/image_array'],d['pilot/throttle'],d['pilot/angle']]
else:
return [d['user/mode'],d['user/throttle'],d['user/angle'],root,d['cam/image_array']]
def numericalSort(value):
parts = numbers.split(value)
parts[1::2] = map(int, parts[1::2])
return parts
def unzip_file(root,f):
zip_ref = zipfile.ZipFile(os.path.join(root,f), 'r')
zip_ref.extractall(root)
zip_ref.close()
for root, dirs, files in os.walk('/opt/ml/input/data/train'):
for f in files:
for f in files:
if f.endswith('.zip'):
unzip_file(root, f)
for root, dirs, files in os.walk('/opt/ml/input/data/train'):
data.extend([get_data(root,f) for f in sorted(files, key=numericalSort) if f.startswith('record') and f.endswith('.json')])
data.extend(
[get_data(root, f) for f in sorted(files, key=str.lower) if f.startswith('record') and f.endswith('.json')])
# Normalize / correct data
data = [d for d in data if d[1] > 0.1]
for d in data:
if d[1] < 0.2:
d[1] = 0.2
# ### Loading throttle and angle ###
angle = [d[2] for d in data]
throttle = [d[1] for d in data]
angle = [d[0] for d in data]
angle_array = np.array(angle)
throttle_array = np.array(throttle)
if (len(data[0]) > 5):
pilot_angle = [d[6] for d in data]
pilot_throttle = [d[5] for d in data]
pilot_angle_array = np.array(pilot_angle)
pilot_throttle_array = np.array(pilot_throttle)
else:
pilot_angle = []
pilot_throttle = []
# ### Loading images ###
images = np.array([img_to_array(load_img(os.path.join(d[3],d[4]))) for d in data],'f')
images = np.array([img_to_array(load_img(os.path.join(d[1], d[2]))) for d in data], 'f')
# slide images vs orders
if env.hyperparameters.get('with_slide', False):
images = images[:len(images)-2]
images = images[:len(images) - 2]
angle_array = angle_array[2:]
throttle_array = throttle_array[2:]
# ### Start training ###
def linear_bin(a):
a = a + 1
b = round(a / (2/14))
b = round(a / (2 / 14))
arr = np.zeros(15)
arr[int(b)] = 1
return arr
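# e.g. linear_bin(-1.0) -> one-hot at index 0, linear_bin(0.0) -> index 7,
# linear_bin(1.0) -> index 14: a steering angle in [-1, 1] is binned into 15 classes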
logs = callbacks.TensorBoard(log_dir='logs', histogram_freq=0, write_graph=True, write_images=True)
save_best = callbacks.ModelCheckpoint('/opt/ml/model/model_cat', monitor='angle_out_loss', verbose=1, save_best_only=True, mode='min')
early_stop = callbacks.EarlyStopping(monitor='angle_out_loss',
min_delta=.0005,
patience=10,
verbose=1,
mode='auto')
img_in = Input(shape=(120, 160, 3), name='img_in') # First layer, input layer, Shape comes from camera.py resolution, RGB
save_best = callbacks.ModelCheckpoint('/opt/ml/model/model_cat', monitor='val_loss', verbose=1,
save_best_only=True, mode='min')
early_stop = callbacks.EarlyStopping(monitor='val_loss',
min_delta=.0005,
patience=10,
verbose=1,
mode='auto')
# Only for export model to tensorflow
sess = tf.Session()
K.set_session(sess)
# First layer, input layer, Shape comes from camera.py resolution, RGB
img_in = Input(shape=(128, 160, 3),
name='img_in')
x = img_in
x = Convolution2D(24, (5,5), strides=(2,2), activation='relu')(x) # 24 features, 5 pixel x 5 pixel kernel (convolution, feauture) window, 2wx2h stride, relu activation
x = Convolution2D(32, (5,5), strides=(2,2), activation='relu')(x) # 32 features, 5px5p kernel window, 2wx2h stride, relu activatiion
x = Convolution2D(64, (5,5), strides=(2,2), activation='relu')(x) # 64 features, 5px5p kernal window, 2wx2h stride, relu
x = Convolution2D(64, (3,3), strides=(2,2), activation='relu')(x) # 64 features, 3px3p kernal window, 2wx2h stride, relu
x = Convolution2D(64, (3,3), strides=(1,1), activation='relu')(x) # 64 features, 3px3p kernal window, 1wx1h stride, relu
# 24 features, 5 pixel x 5 pixel kernel (convolution, feature) window, 2wx2h stride, relu activation
x = Convolution2D(24, (5, 5), strides=(2, 2), activation='relu')(x)
# 32 features, 5px5p kernel window, 2wx2h stride, relu activation
x = Convolution2D(32, (5, 5), strides=(2, 2), activation='relu')(x)
# 64 features, 5px5p kernel window, 2wx2h stride, relu
x = Convolution2D(64, (5, 5), strides=(2, 2), activation='relu')(x)
# 64 features, 3px3p kernel window, 2wx2h stride, relu
x = Convolution2D(64, (3, 3), strides=(2, 2), activation='relu')(x)
# 64 features, 3px3p kernel window, 1wx1h stride, relu
x = Convolution2D(64, (3, 3), strides=(1, 1), activation='relu')(x)
# Possibly add MaxPooling (will make it less sensitive to position in image). Camera angle is fixed, so it may not be needed
x = Flatten(name='flattened')(x) # Flatten to 1D (Fully connected)
x = Dense(100, activation='relu')(x) # Classify the data into 100 features, make all negatives 0
x = Flatten(name='flattened')(x) # Flatten to 1D (Fully connected)
x = Dense(100, activation='relu')(x) # Classify the data into 100 features, make all negatives 0
x = Dropout(.1)(x)
x = Dense(50, activation='relu')(x)
x = Dropout(.1)(x) # Randomly drop out 10% of the neurons (Prevent overfitting)
#categorical output of the angle
# Randomly drop out 10% of the neurons (Prevent overfitting)
x = Dropout(.1)(x)
# categorical output of the angle
callbacks_list = [save_best, early_stop, logs]
angle_out = Dense(15, activation='softmax', name='angle_out')(x) # Connect every input with every output and output 15 hidden units. Use Softmax to give percentage. 15 categories and find best one based off percentage 0.0-1.0
# Connect every input with every output and output 15 hidden units. Use Softmax to give percentage.
# 15 categories and find best one based off percentage 0.0-1.0
angle_out = Dense(15, activation='softmax', name='angle_out')(x)
# continuous output of throttle
throttle_out = Dense(1, activation='relu', name='throttle_out')(x) # Reduce to 1 number, Positive number only
angle_cat_array = np.array([linear_bin(a) for a in angle_array])
model = Model(inputs=[img_in], outputs=[angle_out, throttle_out])
model = Model(inputs=[img_in], outputs=[angle_out])
model.compile(optimizer='adam',
loss={'angle_out': 'categorical_crossentropy',
'throttle_out': 'mean_absolute_error'},
loss_weights={'angle_out': 0.9, 'throttle_out': .001})
model.fit({'img_in':images},{'angle_out': angle_cat_array, 'throttle_out': throttle_array}, batch_size=32, epochs=100, verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list)
loss={'angle_out': 'categorical_crossentropy', },
loss_weights={'angle_out': 0.9 })
model.fit({'img_in': images}, {'angle_out': angle_cat_array, }, batch_size=32,
epochs=100, verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list)
# Save the model in TensorFlow SavedModel format
builder = tf.saved_model.builder.SavedModelBuilder("/opt/ml/model/tfModel")
# Tag the model, required for Go
builder.add_meta_graph_and_variables(sess, ["myTag"])
builder.save()
sess.close()
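# The exported SavedModel can be reloaded in TF 1.x with
#   tf.saved_model.loader.load(sess, ["myTag"], "/opt/ml/model/tfModel")
# (the tags must match those passed to add_meta_graph_and_variables above)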