diff --git a/Dockerfile.gpu b/Dockerfile.gpu index 9167d27..bc382b4 100644 --- a/Dockerfile.gpu +++ b/Dockerfile.gpu @@ -1,38 +1,15 @@ -FROM python:3.5 as builder +FROM docker.io/tensorflow/tensorflow:2.6.0-gpu -RUN mkdir -p /usr/src -ADD . /usr/src -WORKDIR /usr/src - -RUN python3 setup.py sdist - -#FROM tensorflow/tensorflow:1.8.0-py3 -FROM tensorflow/tensorflow:1.15.0-gpu-py3 - -#tensorflow-serving-api-python3==1.7.0 COPY requirements.txt . RUN pip3 install --upgrade pip==20.0.2 && pip3 list && pip3 install -r requirements.txt \ && pip3 list WORKDIR /root -RUN apt-get -y update && \ - apt-get -y install curl && \ - apt-get -y install vim && \ - apt-get -y install iputils-ping && \ - apt-get -y install nginx +# copy the training script inside the container +COPY src/tf_container/train.py /opt/ml/code/train.py -# install telegraf -RUN cd /tmp && \ - curl -O https://dl.influxdata.com/telegraf/releases/telegraf_1.4.2-1_amd64.deb && \ - dpkg -i telegraf_1.4.2-1_amd64.deb && \ - cd - +# define train.py as the script entry point +ENV SAGEMAKER_PROGRAM train.py -COPY --from=builder /usr/src/dist/robocars_sagemaker_container-1.0.0.tar.gz . - -RUN pip3 install robocars_sagemaker_container-1.0.0.tar.gz - -RUN rm robocars_sagemaker_container-1.0.0.tar.gz - -ENTRYPOINT ["train"] diff --git a/create_job.sh b/create_job.sh index 8985ef9..559bcea 100755 --- a/create_job.sh +++ b/create_job.sh @@ -15,7 +15,7 @@ DATA_OUTPUT="s3://robocars-cyrilix-learning/output" aws sagemaker create-training-job \ --training-job-name ${job_name} \ - --hyper-parameters '{ "sagemaker_region": "\"eu-west-1\"", "with_slide": "true" }' \ + --hyper-parameters '{ "sagemaker_region": "\"eu-west-1\"", "with_slide": "true", "img_height": "120", "img_width": "160" }' \ --algorithm-specification TrainingImage="${training_image}",TrainingInputMode=File \ --role-arn ${iam_role_arn} \ --input-data-config "[{ \"ChannelName\": \"train\", \"DataSource\": { \"S3DataSource\": { \"S3DataType\": \"S3Prefix\", \"S3Uri\": \"${DATA_BUCKET}\", \"S3DataDistributionType\": \"FullyReplicated\" }} }]" \ diff --git a/requirements.txt b/requirements.txt index 779101d..c20ed13 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,13 @@ -sagemaker-container-support==1.1.3 -numpy==1.18.1 -boto3==1.12.11 -six==1.14.0 -awscli==1.18.11 -flask==0.12.5 -Jinja2==2.11.1 -gevent==1.4.0 -gunicorn==19.10.0 -keras==2.1.3 -pillow==7.0.0 -h5py==2.10.0 +#sagemaker-container-support==1.1.3 +sagemaker-training==3.9.2 +tensorflow==2.6.0 +numpy==1.19.5 +#boto3==1.18.56 +#six==1.15.0 +#awscli==1.20.56 +#flask==0.12.5 +#Jinja2==3.0.2 +#gevent==1.5.0 +#gunicorn==19.7.1 +pillow==8.3.2 +#h5py==3.1.0 diff --git a/src/tf_container/train.py b/src/tf_container/train.py new file mode 100644 index 0000000..40b6904 --- /dev/null +++ b/src/tf_container/train.py @@ -0,0 +1,292 @@ +#!/usr/bin/env python3 + +import os + +# import container_support as cs +import argparse +import json + +import numpy as np +import re +import tensorflow as tf +import zipfile +# from tensorflow.keras import backend as K +from tensorflow.keras import callbacks +from tensorflow.keras.layers import Convolution2D +from tensorflow.keras.layers import Dropout, Flatten, Dense +from tensorflow.keras.layers import Input +from tensorflow.keras.models import Model +from tensorflow.keras.preprocessing.image import load_img, img_to_array +from tensorflow.python.client import device_lib + + +def linear_bin(a: float, N: int = 15, offset: int = 1, R: float = 2.0): + """ + create a bin of length N + map val A to range R + offset one hot bin by offset, commonly R/2 + """ + a = a + offset + b = round(a / (R / (N - offset))) + arr = np.zeros(N) + b = clamp(b, 0, N - 1) + arr[int(b)] = 1 + return arr + + +def clamp(n, min, max): + if n <= min: + return min + if n >= max: + return max + return n + + +def get_data(root_dir, filename): + print('load data from file ' + filename) + d = json.load(open(os.path.join(root_dir, filename))) + return [(d['user/angle']), root_dir, d['cam/image_array']] + + +numbers = re.compile(r'(\d+)') + + +def unzip_file(root, f): + zip_ref = zipfile.ZipFile(os.path.join(root, f), 'r') + zip_ref.extractall(root) + zip_ref.close() + + +def train(batch_size: int, slide_size: int, img_height: int, img_width: int, img_depth: int, horizon: int, drop: float): + # env = cs.TrainingEnvironment() + + print(device_lib.list_local_devices()) + os.system('mkdir -p logs') + + # ### Loading the files ### + # ** You need to copy all your files to the directory where you are runing this notebook ** + # ** into a folder named "data" ** + + data = [] + + for root, dirs, files in os.walk('/opt/ml/input/data/train'): + for f in files: + if f.endswith('.zip'): + unzip_file(root, f) + + for root, dirs, files in os.walk('/opt/ml/input/data/train'): + data.extend( + [get_data(root, f) for f in sorted(files, key=str.lower) if f.startswith('record') and f.endswith('.json')]) + + # ### Loading throttle and angle ### + + angle = [d[0] for d in data] + angle_array = np.array(angle) + + # ### Loading images ### + if horizon > 0: + images = np.array([img_to_array(load_img(os.path.join(d[1], d[2])).crop((0, horizon, img_width, img_height))) for d in data], 'f') + else: + images = np.array( [img_to_array(load_img(os.path.join(d[1], d[2]))) for d in data], 'f') + + # slide images vs orders + if slide_size > 0: + images = images[:len(images) - slide_size] + angle_array = angle_array[slide_size:] + + # ### Start training ### + from datetime import datetime + logdir = '/opt/ml/model/logs/' + datetime.now().strftime("%Y%m%d-%H%M%S") + logs = callbacks.TensorBoard(log_dir=logdir, histogram_freq=0, write_graph=True, write_images=True) + + # Creates a file writer for the log directory. + # file_writer = tf.summary.create_file_writer(logdir) + + # Using the file writer, log the reshaped image. + # with file_writer.as_default(): + # # Don't forget to reshape. + # imgs = np.reshape(images[0:25], (-1, img_height, img_width, img_depth)) + # tf.summary.image("25 training data examples", imgs, max_outputs=25, step=0) + + save_best = callbacks.ModelCheckpoint('/opt/ml/model/model_cat', monitor='val_loss', verbose=1, + save_best_only=True, mode='min') + early_stop = callbacks.EarlyStopping(monitor='val_loss', + min_delta=.0005, + patience=5, + verbose=1, + mode='auto') + + # categorical output of the angle + callbacks_list = [save_best, early_stop, logs] + + angle_cat_array = np.array([linear_bin(float(a)) for a in angle_array]) + + model = default_model(input_shape=(img_height - horizon, img_width, img_depth), drop=drop) + #model = default_categorical(input_shape=(img_height - horizon, img_width, img_depth), drop=drop) + + model.compile(optimizer='adam', + loss={'angle_out': 'categorical_crossentropy', }, + loss_weights={'angle_out': 0.9}) + model.fit({'img_in': images}, {'angle_out': angle_cat_array, }, batch_size=batch_size, + epochs=100, verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list) + + # Save model for tensorflow using + model.save("/opt/ml/model/tfModel", save_format="tf") + + def representative_dataset(): + for d in tf.data.Dataset.from_tensor_slices(images).batch(1).take(100): + yield [tf.dtypes.cast(d, tf.float32)] + + converter = tf.lite.TFLiteConverter.from_keras_model(model) + + # full quantization for edgeTpu + # https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.representative_dataset = representative_dataset + converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + converter.inference_input_type = tf.uint8 # or tf.int8 + converter.inference_output_type = tf.uint8 # or tf.int8 + + tflite_model = converter.convert() + + # Save the model. + with open('/opt/ml/model/model_' + str(img_width) + 'x' + str(img_height) + 'h' + str(horizon) + '.tflite', + 'wb') as f: + f.write(tflite_model) + + +def conv2d(filters, kernel, strides, layer_num, activation='relu'): + """ + Helper function to create a standard valid-padded convolutional layer + with square kernel and strides and unified naming convention + :param filters: channel dimension of the layer + :param kernel: creates (kernel, kernel) kernel matrix dimension + :param strides: creates (strides, strides) stride + :param layer_num: used in labelling the layer + :param activation: activation, defaults to relu + :return: tf.keras Convolution2D layer + """ + return Convolution2D(filters=filters, + kernel_size=(kernel, kernel), + strides=(strides, strides), + activation=activation, + name='conv2d_' + str(layer_num)) + + +def core_cnn_layers(img_in: Input, img_height: int, img_width: int, drop: float, l4_stride: int = 1): + """ + Returns the core CNN layers that are shared among the different models, + like linear, imu, behavioural + :param img_in: input layer of network + :param drop: dropout rate + :param l4_stride: 4-th layer stride, default 1 + :return: stack of CNN layers + """ + x = img_in + x = conv2d(img_height/5, 5, 2, 1)(x) + x = Dropout(drop)(x) + x = conv2d(img_width / 5, 5, 2, 2)(x) + x = Dropout(drop)(x) + x = conv2d(64, 5, 2, 3)(x) + x = Dropout(drop)(x) + x = conv2d(64, 3, l4_stride, 4)(x) + x = Dropout(drop)(x) + x = conv2d(64, 3, 1, 5)(x) + x = Dropout(drop)(x) + x = Flatten(name='flattened')(x) + return x + + +def default_model(input_shape, drop): + # First layer, input layer, Shape comes from camera.py resolution, RGB + img_in = Input(shape=input_shape, name='img_in') + kernel_size = 5 + + x = img_in + # 24 features, 5 pixel x 5 pixel kernel (convolution, feauture) window, 2wx2h stride, relu activation + x = Convolution2D(input_shape[1] / kernel_size, (kernel_size, kernel_size), strides=(2, 2), activation='relu')(x) + x = Dropout(drop)(x) + # 32 features, 5px5p kernel window, 2wx2h stride, relu activatiion + x = Convolution2D(input_shape[0] / kernel_size, (kernel_size, kernel_size), strides=(2, 2), activation='relu')(x) + x = Dropout(drop)(x) + # 64 features, 5px5p kernel window, 2wx2h stride, relu + x = Convolution2D(64, (kernel_size, kernel_size), strides=(2, 2), activation='relu')(x) + x = Dropout(drop)(x) + # 64 features, 3px3p kernel window, 2wx2h stride, relu + x = Convolution2D(64, (3, 3), strides=(2, 2), activation='relu')(x) + x = Dropout(drop)(x) + # 64 features, 3px3p kernel window, 1wx1h stride, relu + x = Convolution2D(64, (3, 3), strides=(1, 1), activation='relu')(x) + x = Dropout(drop)(x) + + # Possibly add MaxPooling (will make it less sensitive to position in image). + # Camera angle fixed, so may not to be needed + + x = Flatten(name='flattened')(x) # Flatten to 1D (Fully connected) + x = Dense(100, activation='relu')(x) # Classify the data into 100 features, make all negatives 0 + x = Dropout(drop)(x) + x = Dense(50, activation='relu')(x) + x = Dropout(drop)(x) + # Connect every input with every output and output 15 hidden units. Use Softmax to give percentage. + # 15 categories and find best one based off percentage 0.0-1.0 + angle_out = Dense(15, activation='softmax', name='angle_out')(x) + + model = Model(inputs=[img_in], outputs=[angle_out]) + + return model + + +def default_n_linear(num_outputs, input_shape=(120, 160, 3), drop=0.2): + img_in = Input(shape=input_shape, name='img_in') + x = core_cnn_layers(img_in, img_width=input_shape[1], img_height=input_shape[0], drop=drop) + x = Dense(100, activation='relu', name='dense_1')(x) + x = Dropout(drop)(x) + x = Dense(50, activation='relu', name='dense_2')(x) + x = Dropout(drop)(x) + + outputs = [] + for i in range(num_outputs): + outputs.append( + Dense(1, activation='linear', name='n_outputs' + str(i))(x)) + + model = Model(inputs=[img_in], outputs=outputs, name='linear') + return model + + +def default_categorical(input_shape=(120, 160, 3), drop=0.2): + img_in = Input(shape=input_shape, name='img_in') + x = core_cnn_layers(img_in, img_width=input_shape[1], img_height=input_shape[0], drop=drop, l4_stride=2) + x = Dense(100, activation='relu', name="dense_1")(x) + x = Dropout(drop)(x) + x = Dense(50, activation='relu', name="dense_2")(x) + x = Dropout(drop)(x) + # Categorical output of the angle into 15 bins + angle_out = Dense(15, activation='softmax', name='angle_out')(x) + + model = Model(inputs=[img_in], outputs=[angle_out], + name='categorical') + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--slide_size", type=int, default=0) + parser.add_argument("--img_height", type=int, default=120) + parser.add_argument("--img_width", type=int, default=160) + parser.add_argument("--img_depth", type=int, default=3) + parser.add_argument("--horizon", type=int, default=0) + parser.add_argument("--batch_size", type=int, default=32) + parser.add_argument("--drop", type=float, default=0.2) + + args = parser.parse_args() + params = vars(args) + train( + batch_size=params["batch_size"], + slide_size=params["slide_size"], + img_height=params["img_height"], + img_width=params["img_width"], + img_depth=params["img_depth"], + horizon=params["horizon"], + drop=params["drop"], + ) diff --git a/src/tf_container/train_entry_point.py b/src/tf_container/train_entry_point.py deleted file mode 100644 index a27ff80..0000000 --- a/src/tf_container/train_entry_point.py +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python3 - -import os - -import container_support as cs -import json -import numpy as np -import re -import tensorflow as tf -import zipfile -from keras import backend as K -from keras import callbacks -from keras.layers import Convolution2D -from keras.layers import Dropout, Flatten, Dense -from keras.layers import Input -from keras.models import Model -from keras.preprocessing.image import load_img, img_to_array -from tensorflow.python.client import device_lib - - -def get_data(root_dir, filename): - print('load data from file ' + filename) - d = json.load(open(os.path.join(root_dir, filename))) - return [d['user/angle'], root_dir, d['cam/image_array']] - - -numbers = re.compile(r'(\d+)') - - -def unzip_file(root, f): - zip_ref = zipfile.ZipFile(os.path.join(root, f), 'r') - zip_ref.extractall(root) - zip_ref.close() - - -def train(): - env = cs.TrainingEnvironment() - - print(device_lib.list_local_devices()) - os.system('mkdir -p logs') - - # ### Loading the files ### - # ** You need to copy all your files to the directory where you are runing this notebook ** - # ** into a folder named "data" ** - - data = [] - - for root, dirs, files in os.walk('/opt/ml/input/data/train'): - for f in files: - if f.endswith('.zip'): - unzip_file(root, f) - - for root, dirs, files in os.walk('/opt/ml/input/data/train'): - data.extend( - [get_data(root, f) for f in sorted(files, key=str.lower) if f.startswith('record') and f.endswith('.json')]) - - - # ### Loading throttle and angle ### - - angle = [d[0] for d in data] - angle_array = np.array(angle) - - # ### Loading images ### - images = np.array([img_to_array(load_img(os.path.join(d[1], d[2]))) for d in data], 'f') - - # slide images vs orders - if env.hyperparameters.get('with_slide', False): - images = images[:len(images) - 2] - angle_array = angle_array[2:] - - # ### Start training ### - def linear_bin(a): - a = a + 1 - b = round(a / (2 / 14)) - arr = np.zeros(15) - arr[int(b)] = 1 - return arr - - logs = callbacks.TensorBoard(log_dir='logs', histogram_freq=0, write_graph=True, write_images=True) - save_best = callbacks.ModelCheckpoint('/opt/ml/model/model_cat', monitor='val_loss', verbose=1, - save_best_only=True, mode='min') - early_stop = callbacks.EarlyStopping(monitor='val_loss', - min_delta=.0005, - patience=10, - verbose=1, - mode='auto') - # Only for export model to tensorflow - sess = tf.Session() - K.set_session(sess) - - # First layer, input layer, Shape comes from camera.py resolution, RGB - img_in = Input(shape=(128, 160, 3), - name='img_in') - x = img_in - # 24 features, 5 pixel x 5 pixel kernel (convolution, feauture) window, 2wx2h stride, relu activation - x = Convolution2D(24, (5, 5), strides=(2, 2), activation='relu')(x) - # 32 features, 5px5p kernel window, 2wx2h stride, relu activatiion - x = Convolution2D(32, (5, 5), strides=(2, 2), activation='relu')(x) - # 64 features, 5px5p kernal window, 2wx2h stride, relu - x = Convolution2D(64, (5, 5), strides=(2, 2), activation='relu')(x) - # 64 features, 3px3p kernal window, 2wx2h stride, relu - x = Convolution2D(64, (3, 3), strides=(2, 2), activation='relu')(x) - # 64 features, 3px3p kernal window, 1wx1h stride, relu - x = Convolution2D(64, (3, 3), strides=(1, 1), activation='relu')(x) - - # Possibly add MaxPooling (will make it less sensitive to position in image). Camera angle fixed, so may not to be needed - - x = Flatten(name='flattened')(x) # Flatten to 1D (Fully connected) - x = Dense(100, activation='relu')(x) # Classify the data into 100 features, make all negatives 0 - x = Dropout(.1)(x) - x = Dense(50, activation='relu')(x) - # Randomly drop out 10% of the neurons (Prevent overfitting) - x = Dropout(.1)(x) - # categorical output of the angle - callbacks_list = [save_best, early_stop, logs] - # Connect every input with every output and output 15 hidden units. Use Softmax to give percentage. - # 15 categories and find best one based off percentage 0.0-1.0 - angle_out = Dense(15, activation='softmax', name='angle_out')(x) - - angle_cat_array = np.array([linear_bin(a) for a in angle_array]) - model = Model(inputs=[img_in], outputs=[angle_out]) - model.compile(optimizer='adam', - loss={'angle_out': 'categorical_crossentropy', }, - loss_weights={'angle_out': 0.9 }) - model.fit({'img_in': images}, {'angle_out': angle_cat_array, }, batch_size=32, - epochs=100, verbose=1, validation_split=0.2, shuffle=True, callbacks=callbacks_list) - - # Save model for tensorflow using - builder = tf.saved_model.builder.SavedModelBuilder("/opt/ml/model/tfModel") - - # Tag the model, required for Go - builder.add_meta_graph_and_variables(sess, ["myTag"]) - builder.save() - sess.close()