From fc1945cecd0d5c351d7cf0d1c7a83f52716e897c Mon Sep 17 00:00:00 2001
From: Cyrille Nofficial
Date: Wed, 8 Jun 2022 23:20:39 +0200
Subject: [PATCH] refactor: upgrade to tensorflow 2.6.0 and rewrite models

---
 Dockerfile.gpu                        |  33 +--
 create_job.sh                         |   2 +-
 requirements.txt                      |  25 +--
 src/tf_container/train.py             | 292 ++++++++++++++++++++++++++
 src/tf_container/train_entry_point.py | 134 ------------
 5 files changed, 311 insertions(+), 175 deletions(-)
 create mode 100644 src/tf_container/train.py
 delete mode 100644 src/tf_container/train_entry_point.py

diff --git a/Dockerfile.gpu b/Dockerfile.gpu
index 9167d27..bc382b4 100644
--- a/Dockerfile.gpu
+++ b/Dockerfile.gpu
@@ -1,38 +1,15 @@
-FROM python:3.5 as builder
+FROM docker.io/tensorflow/tensorflow:2.6.0-gpu
-RUN mkdir -p /usr/src
-ADD . /usr/src
-WORKDIR /usr/src
-
-RUN python3 setup.py sdist
-
-#FROM tensorflow/tensorflow:1.8.0-py3
-FROM tensorflow/tensorflow:1.15.0-gpu-py3
-
-#tensorflow-serving-api-python3==1.7.0
 COPY requirements.txt .
 RUN pip3 install --upgrade pip==20.0.2 && pip3 list && pip3 install -r requirements.txt \
     && pip3 list
 WORKDIR /root
-RUN apt-get -y update && \
-    apt-get -y install curl && \
-    apt-get -y install vim && \
-    apt-get -y install iputils-ping && \
-    apt-get -y install nginx
+# copy the training script inside the container
+COPY src/tf_container/train.py /opt/ml/code/train.py
-# install telegraf
-RUN cd /tmp && \
-    curl -O https://dl.influxdata.com/telegraf/releases/telegraf_1.4.2-1_amd64.deb && \
-    dpkg -i telegraf_1.4.2-1_amd64.deb && \
-    cd -
+# define train.py as the script entry point
+ENV SAGEMAKER_PROGRAM train.py
-COPY --from=builder /usr/src/dist/robocars_sagemaker_container-1.0.0.tar.gz .
-
-RUN pip3 install robocars_sagemaker_container-1.0.0.tar.gz
-
-RUN rm robocars_sagemaker_container-1.0.0.tar.gz
-
-ENTRYPOINT ["train"]
diff --git a/create_job.sh b/create_job.sh
index 8985ef9..559bcea 100755
--- a/create_job.sh
+++ b/create_job.sh
@@ -15,7 +15,7 @@ DATA_OUTPUT="s3://robocars-cyrilix-learning/output"
 aws sagemaker create-training-job \
     --training-job-name ${job_name} \
-    --hyper-parameters '{ "sagemaker_region": "\"eu-west-1\"", "with_slide": "true" }' \
+    --hyper-parameters '{ "sagemaker_region": "\"eu-west-1\"", "with_slide": "true", "img_height": "120", "img_width": "160" }' \
     --algorithm-specification TrainingImage="${training_image}",TrainingInputMode=File \
     --role-arn ${iam_role_arn} \
     --input-data-config "[{ \"ChannelName\": \"train\", \"DataSource\": { \"S3DataSource\": { \"S3DataType\": \"S3Prefix\", \"S3Uri\": \"${DATA_BUCKET}\", \"S3DataDistributionType\": \"FullyReplicated\" }} }]" \
diff --git a/requirements.txt b/requirements.txt
index 779101d..c20ed13 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,13 @@
-sagemaker-container-support==1.1.3
-numpy==1.18.1
-boto3==1.12.11
-six==1.14.0
-awscli==1.18.11
-flask==0.12.5
-Jinja2==2.11.1
-gevent==1.4.0
-gunicorn==19.10.0
-keras==2.1.3
-pillow==7.0.0
-h5py==2.10.0
+#sagemaker-container-support==1.1.3
+sagemaker-training==3.9.2
+tensorflow==2.6.0
+numpy==1.19.5
+#boto3==1.18.56
+#six==1.15.0
+#awscli==1.20.56
+#flask==0.12.5
+#Jinja2==3.0.2
+#gevent==1.5.0
+#gunicorn==19.7.1
+pillow==8.3.2
+#h5py==3.1.0
diff --git a/src/tf_container/train.py b/src/tf_container/train.py
new file mode 100644
index 0000000..40b6904
--- /dev/null
+++ b/src/tf_container/train.py
@@ -0,0 +1,292 @@
+#!/usr/bin/env python3
+
+import os
+
+# import container_support as cs
+import argparse
+import json
+
+import numpy as np
+import re
+import tensorflow as tf
+import zipfile
+# from tensorflow.keras import backend as K
+from tensorflow.keras import callbacks
+from tensorflow.keras.layers import Convolution2D
+from tensorflow.keras.layers import Dropout, Flatten, Dense
+from tensorflow.keras.layers import Input
+from tensorflow.keras.models import Model
+from tensorflow.keras.preprocessing.image import load_img, img_to_array
+from tensorflow.python.client import device_lib
+
+
+def linear_bin(a: float, N: int = 15, offset: int = 1, R: float = 2.0):
+    """
+    Create a one-hot bin of length N:
+    map value a to range R and offset the one-hot bin by offset, commonly R/2
+    """
+    a = a + offset
+    b = round(a / (R / (N - offset)))
+    arr = np.zeros(N)
+    b = clamp(b, 0, N - 1)
+    arr[int(b)] = 1
+    return arr
+
+
+def clamp(n, min, max):
+    if n <= min:
+        return min
+    if n >= max:
+        return max
+    return n
+
+
+def get_data(root_dir, filename):
+    print('load data from file ' + filename)
+    d = json.load(open(os.path.join(root_dir, filename)))
+    return [d['user/angle'], root_dir, d['cam/image_array']]
+
+
+numbers = re.compile(r'(\d+)')
+
+
+def unzip_file(root, f):
+    zip_ref = zipfile.ZipFile(os.path.join(root, f), 'r')
+    zip_ref.extractall(root)
+    zip_ref.close()
+
+
+def train(batch_size: int, slide_size: int, img_height: int, img_width: int, img_depth: int, horizon: int, drop: float):
+    # env = cs.TrainingEnvironment()
+
+    print(device_lib.list_local_devices())
+    os.system('mkdir -p logs')
+
+    # ### Loading the files ###
+    # Training data is read from the SageMaker 'train' channel, mounted under /opt/ml/input/data/train
+
+    data = []
+
+    for root, dirs, files in os.walk('/opt/ml/input/data/train'):
+        for f in files:
+            if f.endswith('.zip'):
+                unzip_file(root, f)
+
+    for root, dirs, files in os.walk('/opt/ml/input/data/train'):
+        data.extend(
+            [get_data(root, f) for f in sorted(files, key=str.lower) if f.startswith('record') and f.endswith('.json')])
+
+    # ### Loading throttle and angle ###
+
+    angle = [d[0] for d in data]
+    angle_array = np.array(angle)
+
+    # ### Loading images ###
+    # When horizon > 0, the top 'horizon' pixels are cropped off each image
+    if horizon > 0:
+        images = np.array([img_to_array(load_img(os.path.join(d[1], d[2])).crop((0, horizon, img_width, img_height))) for d in data], 'f')
+    else:
+        images = np.array([img_to_array(load_img(os.path.join(d[1], d[2]))) for d in data], 'f')
+
+    # slide images vs orders: shift the steering labels by slide_size frames
+    if slide_size > 0:
+        images = images[:len(images) - slide_size]
+        angle_array = angle_array[slide_size:]
+
+    # ### Start training ###
+    from datetime import datetime
+    logdir = '/opt/ml/model/logs/' + datetime.now().strftime("%Y%m%d-%H%M%S")
+    logs = callbacks.TensorBoard(log_dir=logdir, histogram_freq=0, write_graph=True, write_images=True)
+
+    # Creates a file writer for the log directory.
+    # file_writer = tf.summary.create_file_writer(logdir)
+
+    # Using the file writer, log the reshaped image.
+    # with file_writer.as_default():
+    #     # Don't forget to reshape.
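+    #     # Note: when horizon > 0 the loaded images only have img_height - horizon rows,
+    #     # so this example reshape would need (img_height - horizon) instead of img_height.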
+    #     imgs = np.reshape(images[0:25], (-1, img_height, img_width, img_depth))
+    #     tf.summary.image("25 training data examples", imgs, max_outputs=25, step=0)
+
+    save_best = callbacks.ModelCheckpoint('/opt/ml/model/model_cat', monitor='val_loss', verbose=1,
+                                          save_best_only=True, mode='min')
+    early_stop = callbacks.EarlyStopping(monitor='val_loss',
+                                         min_delta=.0005,
+                                         patience=5,
+                                         verbose=1,
+                                         mode='auto')
+
+    # categorical output of the angle
+    callbacks_list = [save_best, early_stop, logs]
+
+    angle_cat_array = np.array([linear_bin(float(a)) for a in angle_array])
+
+    model = default_model(input_shape=(img_height - horizon, img_width, img_depth), drop=drop)
+    # model = default_categorical(input_shape=(img_height - horizon, img_width, img_depth), drop=drop)
+
+    model.compile(optimizer='adam',
+                  loss={'angle_out': 'categorical_crossentropy', },
+                  loss_weights={'angle_out': 0.9})
+    model.fit({'img_in': images}, {'angle_out': angle_cat_array, }, batch_size=batch_size,
+              epochs=100, verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list)
+
+    # Save the model in TensorFlow SavedModel format
+    model.save("/opt/ml/model/tfModel", save_format="tf")
+
+    def representative_dataset():
+        for d in tf.data.Dataset.from_tensor_slices(images).batch(1).take(100):
+            yield [tf.dtypes.cast(d, tf.float32)]
+
+    converter = tf.lite.TFLiteConverter.from_keras_model(model)
+
+    # full integer quantization for the Edge TPU
+    # https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization
+    converter.optimizations = [tf.lite.Optimize.DEFAULT]
+    converter.representative_dataset = representative_dataset
+    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+    converter.inference_input_type = tf.uint8  # or tf.int8
+    converter.inference_output_type = tf.uint8  # or tf.int8
+
+    tflite_model = converter.convert()
+
+    # Save the TFLite model.
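+    # The exported file name encodes the input geometry, e.g. model_160x120h0.tflite
+    # for the default hyperparameters img_width=160, img_height=120 and horizon=0.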
+    with open('/opt/ml/model/model_' + str(img_width) + 'x' + str(img_height) + 'h' + str(horizon) + '.tflite',
+              'wb') as f:
+        f.write(tflite_model)
+
+
+def conv2d(filters, kernel, strides, layer_num, activation='relu'):
+    """
+    Helper function to create a standard valid-padded convolutional layer
+    with square kernel and strides and unified naming convention
+    :param filters: channel dimension of the layer
+    :param kernel: creates (kernel, kernel) kernel matrix dimension
+    :param strides: creates (strides, strides) stride
+    :param layer_num: used in labelling the layer
+    :param activation: activation, defaults to relu
+    :return: tf.keras Convolution2D layer
+    """
+    return Convolution2D(filters=filters,
+                         kernel_size=(kernel, kernel),
+                         strides=(strides, strides),
+                         activation=activation,
+                         name='conv2d_' + str(layer_num))
+
+
+def core_cnn_layers(img_in: Input, img_height: int, img_width: int, drop: float, l4_stride: int = 1):
+    """
+    Returns the core CNN layers that are shared among the different models,
+    like linear, imu, behavioural
+    :param img_in: input layer of network
+    :param img_height: input image height, used to size the first convolution
+    :param img_width: input image width, used to size the second convolution
+    :param drop: dropout rate
+    :param l4_stride: 4-th layer stride, default 1
+    :return: stack of CNN layers
+    """
+    x = img_in
+    x = conv2d(img_height // 5, 5, 2, 1)(x)
+    x = Dropout(drop)(x)
+    x = conv2d(img_width // 5, 5, 2, 2)(x)
+    x = Dropout(drop)(x)
+    x = conv2d(64, 5, 2, 3)(x)
+    x = Dropout(drop)(x)
+    x = conv2d(64, 3, l4_stride, 4)(x)
+    x = Dropout(drop)(x)
+    x = conv2d(64, 3, 1, 5)(x)
+    x = Dropout(drop)(x)
+    x = Flatten(name='flattened')(x)
+    return x
+
+
+def default_model(input_shape, drop):
+    # First layer, input layer, Shape comes from camera.py resolution, RGB
+    img_in = Input(shape=input_shape, name='img_in')
+    kernel_size = 5
+
+    x = img_in
+    # input_shape[1] // 5 features (32 for a 160px-wide image), 5x5 kernel (convolution, feature) window, 2wx2h stride, relu activation
+    x = Convolution2D(input_shape[1] // kernel_size, (kernel_size, kernel_size), strides=(2, 2), activation='relu')(x)
+    x = Dropout(drop)(x)
+    # input_shape[0] // 5 features (24 for a 120px-high image), 5px5p kernel window, 2wx2h stride, relu activation
+    x = Convolution2D(input_shape[0] // kernel_size, (kernel_size, kernel_size), strides=(2, 2), activation='relu')(x)
+    x = Dropout(drop)(x)
+    # 64 features, 5px5p kernel window, 2wx2h stride, relu
+    x = Convolution2D(64, (kernel_size, kernel_size), strides=(2, 2), activation='relu')(x)
+    x = Dropout(drop)(x)
+    # 64 features, 3px3p kernel window, 2wx2h stride, relu
+    x = Convolution2D(64, (3, 3), strides=(2, 2), activation='relu')(x)
+    x = Dropout(drop)(x)
+    # 64 features, 3px3p kernel window, 1wx1h stride, relu
+    x = Convolution2D(64, (3, 3), strides=(1, 1), activation='relu')(x)
+    x = Dropout(drop)(x)
+
+    # Possibly add MaxPooling (will make it less sensitive to position in image).
+    # Camera angle fixed, so may not be needed
+
+    x = Flatten(name='flattened')(x)  # Flatten to 1D (Fully connected)
+    x = Dense(100, activation='relu')(x)  # Classify the data into 100 features, make all negatives 0
+    x = Dropout(drop)(x)
+    x = Dense(50, activation='relu')(x)
+    x = Dropout(drop)(x)
+    # Connect every input with every output and output 15 hidden units. Use Softmax to give percentage.
+ # 15 categories and find best one based off percentage 0.0-1.0 + angle_out = Dense(15, activation='softmax', name='angle_out')(x) + + model = Model(inputs=[img_in], outputs=[angle_out]) + + return model + + +def default_n_linear(num_outputs, input_shape=(120, 160, 3), drop=0.2): + img_in = Input(shape=input_shape, name='img_in') + x = core_cnn_layers(img_in, img_width=input_shape[1], img_height=input_shape[0], drop=drop) + x = Dense(100, activation='relu', name='dense_1')(x) + x = Dropout(drop)(x) + x = Dense(50, activation='relu', name='dense_2')(x) + x = Dropout(drop)(x) + + outputs = [] + for i in range(num_outputs): + outputs.append( + Dense(1, activation='linear', name='n_outputs' + str(i))(x)) + + model = Model(inputs=[img_in], outputs=outputs, name='linear') + return model + + +def default_categorical(input_shape=(120, 160, 3), drop=0.2): + img_in = Input(shape=input_shape, name='img_in') + x = core_cnn_layers(img_in, img_width=input_shape[1], img_height=input_shape[0], drop=drop, l4_stride=2) + x = Dense(100, activation='relu', name="dense_1")(x) + x = Dropout(drop)(x) + x = Dense(50, activation='relu', name="dense_2")(x) + x = Dropout(drop)(x) + # Categorical output of the angle into 15 bins + angle_out = Dense(15, activation='softmax', name='angle_out')(x) + + model = Model(inputs=[img_in], outputs=[angle_out], + name='categorical') + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--slide_size", type=int, default=0) + parser.add_argument("--img_height", type=int, default=120) + parser.add_argument("--img_width", type=int, default=160) + parser.add_argument("--img_depth", type=int, default=3) + parser.add_argument("--horizon", type=int, default=0) + parser.add_argument("--batch_size", type=int, default=32) + parser.add_argument("--drop", type=float, default=0.2) + + args = parser.parse_args() + params = vars(args) + train( + batch_size=params["batch_size"], + slide_size=params["slide_size"], + img_height=params["img_height"], + img_width=params["img_width"], + img_depth=params["img_depth"], + horizon=params["horizon"], + drop=params["drop"], + ) diff --git a/src/tf_container/train_entry_point.py b/src/tf_container/train_entry_point.py deleted file mode 100644 index a27ff80..0000000 --- a/src/tf_container/train_entry_point.py +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python3 - -import os - -import container_support as cs -import json -import numpy as np -import re -import tensorflow as tf -import zipfile -from keras import backend as K -from keras import callbacks -from keras.layers import Convolution2D -from keras.layers import Dropout, Flatten, Dense -from keras.layers import Input -from keras.models import Model -from keras.preprocessing.image import load_img, img_to_array -from tensorflow.python.client import device_lib - - -def get_data(root_dir, filename): - print('load data from file ' + filename) - d = json.load(open(os.path.join(root_dir, filename))) - return [d['user/angle'], root_dir, d['cam/image_array']] - - -numbers = re.compile(r'(\d+)') - - -def unzip_file(root, f): - zip_ref = zipfile.ZipFile(os.path.join(root, f), 'r') - zip_ref.extractall(root) - zip_ref.close() - - -def train(): - env = cs.TrainingEnvironment() - - print(device_lib.list_local_devices()) - os.system('mkdir -p logs') - - # ### Loading the files ### - # ** You need to copy all your files to the directory where you are runing this notebook ** - # ** into a folder named "data" ** - - data = [] - - for root, dirs, files in 
-        for f in files:
-            if f.endswith('.zip'):
-                unzip_file(root, f)
-
-    for root, dirs, files in os.walk('/opt/ml/input/data/train'):
-        data.extend(
-            [get_data(root, f) for f in sorted(files, key=str.lower) if f.startswith('record') and f.endswith('.json')])
-
-
-    # ### Loading throttle and angle ###
-
-    angle = [d[0] for d in data]
-    angle_array = np.array(angle)
-
-    # ### Loading images ###
-    images = np.array([img_to_array(load_img(os.path.join(d[1], d[2]))) for d in data], 'f')
-
-    # slide images vs orders
-    if env.hyperparameters.get('with_slide', False):
-        images = images[:len(images) - 2]
-        angle_array = angle_array[2:]
-
-    # ### Start training ###
-    def linear_bin(a):
-        a = a + 1
-        b = round(a / (2 / 14))
-        arr = np.zeros(15)
-        arr[int(b)] = 1
-        return arr
-
-    logs = callbacks.TensorBoard(log_dir='logs', histogram_freq=0, write_graph=True, write_images=True)
-    save_best = callbacks.ModelCheckpoint('/opt/ml/model/model_cat', monitor='val_loss', verbose=1,
-                                          save_best_only=True, mode='min')
-    early_stop = callbacks.EarlyStopping(monitor='val_loss',
-                                         min_delta=.0005,
-                                         patience=10,
-                                         verbose=1,
-                                         mode='auto')
-    # Only for export model to tensorflow
-    sess = tf.Session()
-    K.set_session(sess)
-
-    # First layer, input layer, Shape comes from camera.py resolution, RGB
-    img_in = Input(shape=(128, 160, 3),
-                   name='img_in')
-    x = img_in
-    # 24 features, 5 pixel x 5 pixel kernel (convolution, feauture) window, 2wx2h stride, relu activation
-    x = Convolution2D(24, (5, 5), strides=(2, 2), activation='relu')(x)
-    # 32 features, 5px5p kernel window, 2wx2h stride, relu activatiion
-    x = Convolution2D(32, (5, 5), strides=(2, 2), activation='relu')(x)
-    # 64 features, 5px5p kernal window, 2wx2h stride, relu
-    x = Convolution2D(64, (5, 5), strides=(2, 2), activation='relu')(x)
-    # 64 features, 3px3p kernal window, 2wx2h stride, relu
-    x = Convolution2D(64, (3, 3), strides=(2, 2), activation='relu')(x)
-    # 64 features, 3px3p kernal window, 1wx1h stride, relu
-    x = Convolution2D(64, (3, 3), strides=(1, 1), activation='relu')(x)
-
-    # Possibly add MaxPooling (will make it less sensitive to position in image). Camera angle fixed, so may not to be needed
-
-    x = Flatten(name='flattened')(x)  # Flatten to 1D (Fully connected)
-    x = Dense(100, activation='relu')(x)  # Classify the data into 100 features, make all negatives 0
-    x = Dropout(.1)(x)
-    x = Dense(50, activation='relu')(x)
-    # Randomly drop out 10% of the neurons (Prevent overfitting)
-    x = Dropout(.1)(x)
-    # categorical output of the angle
-    callbacks_list = [save_best, early_stop, logs]
-    # Connect every input with every output and output 15 hidden units. Use Softmax to give percentage.
-    # 15 categories and find best one based off percentage 0.0-1.0
-    angle_out = Dense(15, activation='softmax', name='angle_out')(x)
-
-    angle_cat_array = np.array([linear_bin(a) for a in angle_array])
-    model = Model(inputs=[img_in], outputs=[angle_out])
-    model.compile(optimizer='adam',
-                  loss={'angle_out': 'categorical_crossentropy', },
-                  loss_weights={'angle_out': 0.9 })
-    model.fit({'img_in': images}, {'angle_out': angle_cat_array, }, batch_size=32,
-              epochs=100, verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list)
-
-    # Save model for tensorflow using
-    builder = tf.saved_model.builder.SavedModelBuilder("/opt/ml/model/tfModel")
-
-    # Tag the model, required for Go
-    builder.add_meta_graph_and_variables(sess, ["myTag"])
-    builder.save()
-    sess.close()