feat(training): new features

* add flip-image option
* add command to list models
* add option to override image size when training is launched
* add option to disable AWS spot instances
2021-11-24 19:31:16 +01:00
parent c69e1c20ef
commit 456f327788
37 changed files with 7232 additions and 93 deletions
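
The flip-image option works by mirroring each camera frame horizontally (imaging.FlipH) and negating the record's steering angle, so a left turn becomes the equivalent right turn, as the diff below shows. The following standalone sketch illustrates just that transform; the record struct, its JSON tags, and the file handling are simplified placeholders, not the project's real types.

package main

import (
	"bytes"
	"fmt"
	"image"
	"image/jpeg"
	"os"

	"github.com/disintegration/imaging"
)

// record is a simplified stand-in for the project's training record.
type record struct {
	UserAngle     float64 `json:"user/angle"`
	CamImageArray string  `json:"cam/image_array"`
}

// flipSample mirrors a jpeg frame horizontally and adjusts the record so the
// label still matches the image.
func flipSample(jpegBytes []byte, rcd record) ([]byte, record, error) {
	img, _, err := image.Decode(bytes.NewReader(jpegBytes))
	if err != nil {
		return nil, rcd, fmt.Errorf("unable to decode jpeg image: %w", err)
	}
	var buf bytes.Buffer
	if err := jpeg.Encode(&buf, imaging.FlipH(img), nil); err != nil {
		return nil, rcd, fmt.Errorf("unable to encode flipped image: %w", err)
	}
	// Mirroring swaps left and right, so the steering label must be negated.
	rcd.UserAngle = -rcd.UserAngle
	rcd.CamImageArray = fmt.Sprintf("flip_%s", rcd.CamImageArray)
	return buf.Bytes(), rcd, nil
}

func main() {
	raw, err := os.ReadFile(os.Args[1])
	if err != nil {
		panic(err)
	}
	flipped, rcd, err := flipSample(raw, record{UserAngle: 0.42, CamImageArray: "cam.jpg"})
	if err != nil {
		panic(err)
	}
	fmt.Printf("flipped %d bytes, new angle %.2f, new name %s\n", len(flipped), rcd.UserAngle, rcd.CamImageArray)
}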

View File

@@ -6,17 +6,21 @@ import (
"encoding/json"
"fmt"
"github.com/cyrilix/robocar-tools/record"
"github.com/disintegration/imaging"
"go.uber.org/zap"
"image"
"image/jpeg"
"io/ioutil"
"os"
"path"
"regexp"
"strings"
)
var camSubDir = "cam"
func WriteArchive(basedir string, archiveName string, sliceSize int) error {
content, err := BuildArchive(basedir, sliceSize)
func WriteArchive(basedir string, archiveName string, sliceSize int, flipImages bool) error {
content, err := BuildArchive(basedir, sliceSize, flipImages)
if err != nil {
return fmt.Errorf("unable to build archive: %w", err)
}
@@ -30,7 +34,7 @@ func WriteArchive(basedir string, archiveName string, sliceSize int) error {
return nil
}
func BuildArchive(basedir string, sliceSize int) ([]byte, error) {
func BuildArchive(basedir string, sliceSize int, flipImages bool) ([]byte, error) {
l := zap.S()
l.Infof("build zip archive from %s\n", basedir)
dirItems, err := ioutil.ReadDir(basedir)
@@ -64,9 +68,29 @@ func BuildArchive(basedir string, sliceSize int) ([]byte, error) {
imgCams, records, err = applySlice(imgCams, records, sliceSize)
}
content, err := buildArchiveContent(imgCams, records)
// Create a buffer to write our archive to.
buf := new(bytes.Buffer)
// Create a new zip archive.
w := zip.NewWriter(buf)
err = buildArchiveContent(w, imgCams, records, false)
if err != nil {
return nil, fmt.Errorf("unable to generate archive content: %w", err)
return nil, fmt.Errorf("unable to build archive: %w", err)
}
if flipImages {
err = buildArchiveContent(w, imgCams, records, true)
if err != nil {
return nil, fmt.Errorf("unable to build archive: %w", err)
}
}
err = w.Close()
if err != nil {
return nil, fmt.Errorf("unable to close zip archive: %w", err)
}
content, err := ioutil.ReadAll(buf)
if err != nil {
return nil, fmt.Errorf("unable to generate archive content: %w", err)
}
l.Info("archive built\n")
return content, nil
@@ -108,40 +132,41 @@ func findNamedMatches(regex *regexp.Regexp, str string) map[string]string {
return results
}
func buildArchiveContent(imgFiles []string, recordFiles []string) ([]byte, error) {
// Create a buffer to write our archive to.
buf := new(bytes.Buffer)
// Create a new zip archive.
w := zip.NewWriter(buf)
err := addJsonFiles(recordFiles, imgFiles, w)
func buildArchiveContent(w *zip.Writer, imgFiles []string, recordFiles []string, withFlipImages bool) error {
err := addJsonFiles(recordFiles, imgFiles, withFlipImages, w)
if err != nil {
return nil, fmt.Errorf("unable to write json files in zip archive: %w", err)
return fmt.Errorf("unable to write json files in zip archive: %w", err)
}
err = addCamImages(imgFiles, w)
err = addCamImages(imgFiles, withFlipImages, w)
if err != nil {
return nil, fmt.Errorf("unable to add cam files in zip archive: %w", err)
return fmt.Errorf("unable to add cam files in zip archive: %w", err)
}
err = w.Close()
if err != nil {
return nil, fmt.Errorf("unable to build archive: %w", err)
}
content, err := ioutil.ReadAll(buf)
return content, err
return err
}
func addCamImages(imgFiles []string, w *zip.Writer) error {
for _, img := range imgFiles {
imgContent, err := ioutil.ReadFile(img)
func addCamImages(imgFiles []string, flipImage bool, w *zip.Writer) error {
for _, im := range imgFiles {
imgContent, err := ioutil.ReadFile(im)
if err != nil {
return fmt.Errorf("unable to read img: %w", err)
}
_, imgName := path.Split(img)
_, imgName := path.Split(im)
if flipImage {
img, _, err := image.Decode(bytes.NewReader(imgContent))
if err != nil {
zap.S().Fatalf("unable to decode jpeg image: %v", err)
}
imgFlip := imaging.FlipH(img)
var bytesBuff bytes.Buffer
err = jpeg.Encode(&bytesBuff, imgFlip, nil)
if err != nil {
return fmt.Errorf("unable to encode flipped image: %w", err)
}
imgContent = bytesBuff.Bytes()
imgName = fmt.Sprintf("flip_%s", imgName)
}
err = addToArchive(w, imgName, imgContent)
if err != nil {
return fmt.Errorf("unable to create new img entry in archive: %w", err)
@@ -150,7 +175,7 @@ func addCamImages(imgFiles []string, w *zip.Writer) error {
return nil
}
func addJsonFiles(recordFiles []string, imgCam []string, w *zip.Writer) error {
func addJsonFiles(recordFiles []string, imgCam []string, flipImage bool, w *zip.Writer) error {
for idx, r := range recordFiles {
content, err := ioutil.ReadFile(r)
if err != nil {
@@ -162,7 +187,13 @@ func addJsonFiles(recordFiles []string, imgCam []string, w *zip.Writer) error {
return fmt.Errorf("unable to unmarshal record: %w", err)
}
_, camName := path.Split(imgCam[idx])
rcd.CamImageArray = camName
if flipImage {
rcd.UserAngle = rcd.UserAngle * -1
rcd.CamImageArray = fmt.Sprintf("flip_%s", camName)
} else {
rcd.CamImageArray = camName
}
recordBytes, err := json.Marshal(&rcd)
if err != nil {
@@ -170,6 +201,9 @@ func addJsonFiles(recordFiles []string, imgCam []string, w *zip.Writer) error {
}
_, recordName := path.Split(r)
if flipImage {
recordName = strings.ReplaceAll(recordName, "record", "record_flip")
}
err = addToArchive(w, recordName, recordBytes)
if err != nil {
return fmt.Errorf("unable to create new record in archive: %w", err)
@@ -178,7 +212,7 @@ func addJsonFiles(recordFiles []string, imgCam []string, w *zip.Writer) error {
return nil
}
func addToArchive(w *zip.Writer, name string, content []byte) error {
recordWriter, err := w.Create(name)
if err != nil {
return fmt.Errorf("unable to create new entry %v in archive: %w", name, err)

pkg/models/models.go (new file)
View File

@@ -0,0 +1,73 @@
package models
import (
"context"
"fmt"
"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/s3"
"github.com/cyrilix/robocar-tools/pkg/awsutils"
"go.uber.org/zap"
"io/ioutil"
"os"
)
func ListModels(ctx context.Context, bucket string) error {
// Create an Amazon S3 service client
client := s3.NewFromConfig(awsutils.MustLoadConfig())
// Get the first page of results for ListObjectsV2 for a bucket
outputs, err := client.ListObjectsV2(
ctx,
&s3.ListObjectsV2Input{
Bucket: aws.String(bucket),
Prefix: aws.String("output"),
},
)
if err != nil {
return fmt.Errorf("unable to list models in bucket %v: %w", bucket, err)
}
for _, output := range outputs.Contents {
fmt.Printf("model: %s\n", *output.Key)
}
return nil
}
func DownloadArchive(ctx context.Context, bucketName, modelPath string) ([]byte, error) {
l := zap.S().With(
"bucket", bucketName,
"model", modelPath,
)
client := s3.NewFromConfig(awsutils.MustLoadConfig())
l.Debug("download model")
archive, err := client.GetObject(
ctx,
&s3.GetObjectInput{
Bucket: aws.String(bucketName),
Key: aws.String(modelPath),
})
if err != nil {
return nil, fmt.Errorf("unable to download model: %w", err)
}
l.Debug("model downloaded")
resp, err := ioutil.ReadAll(archive.Body)
if err != nil {
return nil, fmt.Errorf("unable to read model archive content: %w", err)
}
return resp, nil
}
func DownloadArchiveToFile(ctx context.Context, bucketName, modelPath, outputFile string) error {
arch, err := DownloadArchive(ctx, bucketName, modelPath)
if err != nil {
return fmt.Errorf("unable to download model '%v/%v': %w", bucketName, modelPath, err)
}
err = ioutil.WriteFile(outputFile, arch, os.FileMode(0755))
if err != nil {
return fmt.Errorf("unable to write model '%s' to file '%s': %v", modelPath, outputFile, err)
}
return nil
}
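
For orientation, a hypothetical caller of the new models package might look like the sketch below. Only the two function signatures come from the file above; the bucket name and model key are placeholders.

package main

import (
	"context"
	"log"

	"github.com/cyrilix/robocar-tools/pkg/models"
)

func main() {
	ctx := context.Background()

	// List every model stored under the "output" prefix of the bucket.
	if err := models.ListModels(ctx, "my-robocar-bucket"); err != nil {
		log.Fatalf("unable to list models: %v", err)
	}

	// Fetch one archive and write it to a local file.
	err := models.DownloadArchiveToFile(ctx, "my-robocar-bucket", "output/model.tar.gz", "model.tar.gz")
	if err != nil {
		log.Fatalf("unable to download model: %v", err)
	}
}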

View File

@@ -9,8 +9,7 @@ import (
"github.com/aws/aws-sdk-go-v2/service/sagemaker/types"
"github.com/cyrilix/robocar-tools/pkg/awsutils"
"github.com/cyrilix/robocar-tools/pkg/data"
"io/fs"
"io/ioutil"
"github.com/cyrilix/robocar-tools/pkg/models"
"log"
"strconv"
"time"
@@ -36,9 +35,9 @@ type Training struct {
outputBucket string
}
func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, sliceSize int, outputModelFile string) error {
func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, imgHeight, imgWidth int, sliceSize int, withFlipImage bool, outputModelFile string, enableSpotTraining bool) error {
log.Printf("run training with data from %s\n", basedir)
archive, err := data.BuildArchive(basedir, sliceSize)
archive, err := data.BuildArchive(basedir, sliceSize, withFlipImage)
if err != nil {
return fmt.Errorf("unable to build data archive: %w", err)
}
@@ -54,8 +53,9 @@ func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, sliceS
ctx,
jobName,
sliceSize,
120,
160,
imgHeight,
imgWidth,
enableSpotTraining,
)
if err != nil {
return fmt.Errorf("unable to run training: %w", err)
@@ -69,11 +69,11 @@ func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, sliceS
}
func List(bucketName string) error {
l := zap.S()
pfxInput := prefixInput
// Create an Amazon S3 service client
client := s3.NewFromConfig(mustLoadConfig())
client := s3.NewFromConfig(awsutils.MustLoadConfig())
// Get the first page of results for ListObjectsV2 for a bucket
output, err := client.ListObjectsV2(context.TODO(), &s3.ListObjectsV2Input{
@@ -81,64 +81,71 @@ func List(bucketName string) error {
Prefix: &pfxInput,
})
if err != nil {
log.Fatal(err)
l.Fatal(err)
}
log.Println("first page results:")
l.Info("first page results:")
for _, object := range output.Contents {
if *object.Key == pfxInput {
continue
}
log.Printf("key=%s size=%d", aws.ToString(object.Key), object.Size)
l.Infof("key=%s size=%d", aws.ToString(object.Key), object.Size)
}
return nil
}
func (t *Training) runTraining(ctx context.Context, jobName string, slideSize int, imgHeight, imgWidth int) error {
client := sagemaker.NewFromConfig(mustLoadConfig())
log.Printf("Start training job '%s'\n", jobName)
// TODO: check train data exist
jobOutput, err := client.CreateTrainingJob(
ctx,
&sagemaker.CreateTrainingJobInput{
EnableManagedSpotTraining: true,
AlgorithmSpecification: &types.AlgorithmSpecification{
TrainingInputMode: types.TrainingInputModeFile,
TrainingImage: aws.String(t.ociImage),
},
OutputDataConfig: &types.OutputDataConfig{
S3OutputPath: aws.String(t.outputBucket),
},
ResourceConfig: &types.ResourceConfig{
InstanceCount: 1,
InstanceType: types.TrainingInstanceTypeMlP2Xlarge,
VolumeSizeInGB: 1,
},
RoleArn: aws.String(t.roleArn),
StoppingCondition: &types.StoppingCondition{
MaxRuntimeInSeconds: 1800,
MaxWaitTimeInSeconds: aws.Int32(3600),
},
TrainingJobName: aws.String(jobName),
HyperParameters: map[string]string{
"sagemaker_region": "eu-west-1",
"slide_size": strconv.Itoa(slideSize),
"img_height": strconv.Itoa(imgHeight),
"img_width": strconv.Itoa(imgWidth),
},
InputDataConfig: []types.Channel{
{
ChannelName: aws.String("train"),
DataSource: &types.DataSource{
S3DataSource: &types.S3DataSource{
S3DataType: types.S3DataTypeS3Prefix,
S3Uri: aws.String(fmt.Sprintf("s3://%s/%s", t.bucketName, t.prefixInput)),
S3DataDistributionType: types.S3DataDistributionFullyReplicated,
},
func (t *Training) runTraining(ctx context.Context, jobName string, slideSize int, imgHeight, imgWidth int, enableSpotTraining bool) error {
l := zap.S()
client := sagemaker.NewFromConfig(awsutils.MustLoadConfig())
l.Infof("Start training job '%s'", jobName)
trainingJobInput := sagemaker.CreateTrainingJobInput{
EnableManagedSpotTraining: enableSpotTraining,
AlgorithmSpecification: &types.AlgorithmSpecification{
TrainingInputMode: types.TrainingInputModeFile,
TrainingImage: aws.String(t.ociImage),
},
OutputDataConfig: &types.OutputDataConfig{
S3OutputPath: aws.String(t.outputBucket),
},
ResourceConfig: &types.ResourceConfig{
InstanceCount: 1,
//InstanceType: types.TrainingInstanceTypeMlP2Xlarge,
InstanceType: types.TrainingInstanceTypeMlG4dnXlarge,
VolumeSizeInGB: 1,
},
RoleArn: aws.String(t.roleArn),
StoppingCondition: &types.StoppingCondition{
MaxRuntimeInSeconds: 1800,
},
TrainingJobName: aws.String(jobName),
HyperParameters: map[string]string{
"sagemaker_region": "eu-west-1",
"slide_size": strconv.Itoa(slideSize),
"img_height": strconv.Itoa(imgHeight),
"img_width": strconv.Itoa(imgWidth),
},
InputDataConfig: []types.Channel{
{
ChannelName: aws.String("train"),
DataSource: &types.DataSource{
S3DataSource: &types.S3DataSource{
S3DataType: types.S3DataTypeS3Prefix,
S3Uri: aws.String(fmt.Sprintf("s3://%s/%s", t.bucketName, t.prefixInput)),
S3DataDistributionType: types.S3DataDistributionFullyReplicated,
},
},
},
},
}
if enableSpotTraining {
trainingJobInput.StoppingCondition.MaxWaitTimeInSeconds = aws.Int32(3600)
}
// TODO: check train data exist
jobOutput, err := client.CreateTrainingJob(
ctx,
&trainingJobInput,
)
if err != nil {
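
Note how the new code attaches MaxWaitTimeInSeconds only when spot training is requested: SageMaker accepts that stopping condition only for managed spot jobs, and it must be at least as large as MaxRuntimeInSeconds. A minimal sketch of the same rule, doubling the runtime bound as the commit's 1800/3600 values do:

package main

import (
	"fmt"

	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/service/sagemaker/types"
)

// stoppingCondition mirrors the commit's logic: the wait-time bound is only
// attached when managed spot training is enabled.
func stoppingCondition(enableSpotTraining bool, maxRuntimeSecs int32) *types.StoppingCondition {
	sc := types.StoppingCondition{MaxRuntimeInSeconds: maxRuntimeSecs}
	if enableSpotTraining {
		// Wait time covers the runtime plus time spent waiting for spot capacity.
		sc.MaxWaitTimeInSeconds = aws.Int32(maxRuntimeSecs * 2)
	}
	return &sc
}

func describe(sc *types.StoppingCondition) string {
	if sc.MaxWaitTimeInSeconds == nil {
		return fmt.Sprintf("runtime=%ds wait=unset", sc.MaxRuntimeInSeconds)
	}
	return fmt.Sprintf("runtime=%ds wait=%ds", sc.MaxRuntimeInSeconds, *sc.MaxWaitTimeInSeconds)
}

func main() {
	fmt.Println(describe(stoppingCondition(true, 1800)))  // spot: wait bound set
	fmt.Println(describe(stoppingCondition(false, 1800))) // on-demand: no wait bound
}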