feat(training): new features
* add flip-image option
* add command to list models
* add option to override image size when training is launched
* add option to disable AWS spot instances
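The flip-image option augments the training set by mirroring each camera frame horizontally; because a mirrored road curves the other way, the recorded steering angle has to be negated at the same time. A minimal sketch of that transformation, assuming JPEG frames and a float64 angle (the package and helper names are illustrative, not part of this commit):

package augment

import (
	"bytes"
	"fmt"
	"image/jpeg"

	"github.com/disintegration/imaging"
)

// flipFrame mirrors a JPEG-encoded frame horizontally and negates the
// steering angle, as the archive builder in the diff below does for
// each record.
func flipFrame(frame []byte, angle float64) ([]byte, float64, error) {
	img, err := jpeg.Decode(bytes.NewReader(frame))
	if err != nil {
		return nil, 0, fmt.Errorf("unable to decode jpeg image: %w", err)
	}
	flipped := imaging.FlipH(img)
	var buf bytes.Buffer
	if err := jpeg.Encode(&buf, flipped, nil); err != nil {
		return nil, 0, fmt.Errorf("unable to encode flipped image: %w", err)
	}
	return buf.Bytes(), -angle, nil
}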
@@ -6,17 +6,21 @@ import (
 	"encoding/json"
 	"fmt"
 	"github.com/cyrilix/robocar-tools/record"
+	"github.com/disintegration/imaging"
 	"go.uber.org/zap"
+	"image"
+	"image/jpeg"
 	"io/ioutil"
 	"os"
 	"path"
 	"regexp"
+	"strings"
 )
 
 var camSubDir = "cam"
 
-func WriteArchive(basedir string, archiveName string, sliceSize int) error {
-	content, err := BuildArchive(basedir, sliceSize)
+func WriteArchive(basedir string, archiveName string, sliceSize int, flipImages bool) error {
+	content, err := BuildArchive(basedir, sliceSize, flipImages)
 	if err != nil {
 		return fmt.Errorf("unable to build archive: %w", err)
 	}
@@ -30,7 +34,7 @@ func WriteArchive(basedir string, archiveName string, sliceSize int) error {
 	return nil
 }
 
-func BuildArchive(basedir string, sliceSize int) ([]byte, error) {
+func BuildArchive(basedir string, sliceSize int, flipImages bool) ([]byte, error) {
 	l := zap.S()
 	l.Infof("build zip archive from %s\n", basedir)
 	dirItems, err := ioutil.ReadDir(basedir)
@@ -64,9 +68,29 @@ func BuildArchive(basedir string, sliceSize int) ([]byte, error) {
 		imgCams, records, err = applySlice(imgCams, records, sliceSize)
 	}
 
-	content, err := buildArchiveContent(imgCams, records)
+	// Create a buffer to write our archive to.
+	buf := new(bytes.Buffer)
+	// Create a new zip archive.
+	w := zip.NewWriter(buf)
+
+	err = buildArchiveContent(w, imgCams, records, false)
 	if err != nil {
-		return nil , fmt.Errorf("unable to generate archive content: %w", err)
+		return nil, fmt.Errorf("unable to build archive: %w", err)
 	}
+	if flipImages {
+		err = buildArchiveContent(w, imgCams, records, true)
+		if err != nil {
+			return nil, fmt.Errorf("unable to build archive: %w", err)
+		}
+	}
+
+	err = w.Close()
+	if err != nil {
+		return nil, fmt.Errorf("unable to close zip archive: %w", err)
+	}
+	content, err := ioutil.ReadAll(buf)
+	if err != nil {
+		return nil, fmt.Errorf("unable to generate archive content: %w", err)
+	}
 	l.Info("archive built\n")
 	return content, nil
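With the extra parameter threaded through, callers opt in to the mirrored copies when the archive is built: the content is written once as-is, then a second time flipped. A usage sketch, assuming this file belongs to the data package (as the data.BuildArchive call in the training code below suggests) and that a sliceSize of 0 leaves the record set unsliced:

	// Build a zip archive in which every frame and record appears twice:
	// the original, and a "flip_" copy with the steering angle negated.
	err := data.WriteArchive("/tmp/records", "/tmp/train.zip", 0, true)
	if err != nil {
		log.Fatalf("unable to write archive: %v", err)
	}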
@@ -108,40 +132,41 @@ func findNamedMatches(regex *regexp.Regexp, str string) map[string]string {
 	return results
 }
 
-func buildArchiveContent(imgFiles []string, recordFiles []string) ([]byte, error) {
-	// Create a buffer to write our archive to.
-	buf := new(bytes.Buffer)
-
-	// Create a new zip archive.
-	w := zip.NewWriter(buf)
-
-	err := addJsonFiles(recordFiles, imgFiles, w)
+func buildArchiveContent(w *zip.Writer, imgFiles []string, recordFiles []string, withFlipImages bool) error {
+	err := addJsonFiles(recordFiles, imgFiles, withFlipImages, w)
 	if err != nil {
-		return nil, fmt.Errorf("unable to write json files in zip archive: %w", err)
+		return fmt.Errorf("unable to write json files in zip archive: %w", err)
 	}
 
-	err = addCamImages(imgFiles, w)
+	err = addCamImages(imgFiles, withFlipImages, w)
 	if err != nil {
-		return nil, fmt.Errorf("unable to cam files in zip archive: %w", err)
+		return fmt.Errorf("unable to add cam files in zip archive: %w", err)
 	}
 
-	err = w.Close()
-	if err != nil {
-		return nil, fmt.Errorf("unable to build archive: %w", err)
-	}
-
-	content, err := ioutil.ReadAll(buf)
-	return content, err
+	return err
 }
 
-func addCamImages(imgFiles []string, w *zip.Writer) error {
-	for _, img := range imgFiles {
-		imgContent, err := ioutil.ReadFile(img)
+func addCamImages(imgFiles []string, flipImage bool, w *zip.Writer) error {
+	for _, im := range imgFiles {
+		imgContent, err := ioutil.ReadFile(im)
 		if err != nil {
 			return fmt.Errorf("unable to read img: %w", err)
 		}
 
-		_, imgName := path.Split(img)
+		_, imgName := path.Split(im)
+		if flipImage {
+			img, _, err := image.Decode(bytes.NewReader(imgContent))
+			if err != nil {
+				zap.S().Fatalf("unable to decode jpeg image: %v", err)
+			}
+			imgFlip := imaging.FlipH(img)
+			var bytesBuff bytes.Buffer
+			err = jpeg.Encode(&bytesBuff, imgFlip, nil)
+
+			imgContent = bytesBuff.Bytes()
+			imgName = fmt.Sprintf("flip_%s", imgName)
+		}
 
 		err = addToArchive(w, imgName, imgContent)
 		if err != nil {
 			return fmt.Errorf("unable to create new img entry in archive: %w", err)
@@ -150,7 +175,7 @@ func addCamImages(imgFiles []string, w *zip.Writer) error {
 	return nil
 }
 
-func addJsonFiles(recordFiles []string, imgCam []string, w *zip.Writer) error {
+func addJsonFiles(recordFiles []string, imgCam []string, flipImage bool, w *zip.Writer) error {
 	for idx, r := range recordFiles {
 		content, err := ioutil.ReadFile(r)
 		if err != nil {
@@ -162,7 +187,13 @@ func addJsonFiles(recordFiles []string, imgCam []string, w *zip.Writer) error {
 			return fmt.Errorf("unable to unmarshal record: %w", err)
 		}
 		_, camName := path.Split((imgCam)[idx])
-		rcd.CamImageArray = camName
+
+		if flipImage {
+			rcd.UserAngle = rcd.UserAngle * -1
+			rcd.CamImageArray = fmt.Sprintf("flip_%s", camName)
+		} else {
+			rcd.CamImageArray = camName
+		}
 
 		recordBytes, err := json.Marshal(&rcd)
 		if err != nil {
@@ -170,6 +201,9 @@ func addJsonFiles(recordFiles []string, imgCam []string, w *zip.Writer) error {
 		}
 
 		_, recordName := path.Split(r)
+		if flipImage {
+			recordName = strings.ReplaceAll(recordName, "record", "record_flip")
+		}
 		err = addToArchive(w, recordName, recordBytes)
 		if err != nil {
 			return fmt.Errorf("unable to create new record in archive: %w", err)
@@ -178,7 +212,7 @@ func addJsonFiles(recordFiles []string, imgCam []string, w *zip.Writer) error {
 	return nil
 }
 
 func addToArchive(w *zip.Writer, name string, content []byte) error {
 	recordWriter, err := w.Create(name)
 	if err != nil {
 		return fmt.Errorf("unable to create new entry %v in archive: %w", name, err)
pkg/models/models.go (new file, 73 lines)
@@ -0,0 +1,73 @@
+package models
+
+import (
+	"context"
+	"fmt"
+	"github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/service/s3"
+	"github.com/cyrilix/robocar-tools/pkg/awsutils"
+	"go.uber.org/zap"
+	"io/ioutil"
+	"os"
+)
+
+func ListModels(ctx context.Context, bucket string) error {
+
+	// Create an Amazon S3 service client
+	client := s3.NewFromConfig(awsutils.MustLoadConfig())
+
+	// Get the first page of results for ListObjectsV2 for a bucket
+	outputs, err := client.ListObjectsV2(
+		ctx,
+		&s3.ListObjectsV2Input{
+			Bucket: aws.String(bucket),
+			Prefix: aws.String("output"),
+		},
+	)
+	if err != nil {
+		return fmt.Errorf("unable to list models in bucket %v: %w", bucket, err)
+	}
+	for _, output := range outputs.Contents {
+		fmt.Printf("model: %s\n", *output.Key)
+	}
+
+	return nil
+}
+
+func DownloadArchive(ctx context.Context, bucketName, modelPath string) ([]byte, error) {
+	l := zap.S().With(
+		"bucket", bucketName,
+		"model", modelPath,
+	)
+	client := s3.NewFromConfig(awsutils.MustLoadConfig())
+
+	l.Debug("download model")
+	archive, err := client.GetObject(
+		ctx,
+
+		&s3.GetObjectInput{
+			Bucket: aws.String(bucketName),
+			Key:    aws.String(modelPath),
+		})
+	if err != nil {
+		return nil, fmt.Errorf("unable to download model: %w", err)
+	}
+	l.Debug("model downloaded")
+	resp, err := ioutil.ReadAll(archive.Body)
+	if err != nil {
+		return nil, fmt.Errorf("unable to read model archive content: %w", err)
+	}
+	return resp, nil
+}
+
+func DownloadArchiveToFile(ctx context.Context, bucketName, modelPath, outputFile string) error {
+	arch, err := DownloadArchive(ctx, bucketName, modelPath)
+	if err != nil {
+		return fmt.Errorf("unable to download model '%v/%v': %w", bucketName, modelPath, err)
+	}
+	err = ioutil.WriteFile(outputFile, arch, os.FileMode(0755))
+	if err != nil {
+		return fmt.Errorf("unable to write model '%s' to file '%s': %w", modelPath, outputFile, err)
+	}
+	return nil
+}
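The new models package gives the CLI its list command: ListModels prints every object under the output/ prefix, and DownloadArchiveToFile fetches one model archive to local disk. A hedged usage sketch (bucket name and model key are placeholders):

	ctx := context.Background()
	if err := models.ListModels(ctx, "my-robocar-bucket"); err != nil {
		log.Fatalf("unable to list models: %v", err)
	}
	err := models.DownloadArchiveToFile(ctx, "my-robocar-bucket",
		"output/training-job/output/model.tar.gz", "/tmp/model.tar.gz")
	if err != nil {
		log.Fatalf("unable to download model: %v", err)
	}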
@@ -9,8 +9,7 @@ import (
 	"github.com/aws/aws-sdk-go-v2/service/sagemaker/types"
 	"github.com/cyrilix/robocar-tools/pkg/awsutils"
 	"github.com/cyrilix/robocar-tools/pkg/data"
-	"io/fs"
-	"io/ioutil"
+	"github.com/cyrilix/robocar-tools/pkg/models"
 	"log"
 	"strconv"
 	"time"
@@ -36,9 +35,9 @@ type Training struct {
 	outputBucket string
 }
 
-func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, sliceSize int, outputModelFile string) error {
+func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, imgHeight, imgWidth int, sliceSize int, withFlipImage bool, outputModelFile string, enableSpotTraining bool) error {
 	log.Printf("run training with data from %s\n", basedir)
-	archive, err := data.BuildArchive(basedir, sliceSize)
+	archive, err := data.BuildArchive(basedir, sliceSize, withFlipImage)
 	if err != nil {
 		return fmt.Errorf("unable to build data archive: %w", err)
 	}
@@ -54,8 +53,9 @@ func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, sliceSize int, outputModelFile string) error {
 		ctx,
 		jobName,
 		sliceSize,
-		120,
-		160,
+		imgHeight,
+		imgWidth,
+		enableSpotTraining,
 	)
 	if err != nil {
 		return fmt.Errorf("unable to run training: %w", err)
@@ -69,11 +69,11 @@ func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, sliceSize int, outputModelFile string) error {
 }
 
 func List(bucketName string) error {
-
+	l := zap.S()
 	pfxInput := prefixInput
 
 	// Create an Amazon S3 service client
-	client := s3.NewFromConfig(mustLoadConfig())
+	client := s3.NewFromConfig(awsutils.MustLoadConfig())
 
 	// Get the first page of results for ListObjectsV2 for a bucket
 	output, err := client.ListObjectsV2(context.TODO(), &s3.ListObjectsV2Input{
@@ -81,64 +81,71 @@ func List(bucketName string) error {
 		Prefix: &pfxInput,
 	})
 	if err != nil {
-		log.Fatal(err)
+		l.Fatal(err)
 	}
 
-	log.Println("first page results:")
+	l.Info("first page results:")
 	for _, object := range output.Contents {
 		if *object.Key == pfxInput {
 			continue
 		}
-		log.Printf("key=%s size=%d", aws.ToString(object.Key), object.Size)
+		l.Infof("key=%s size=%d", aws.ToString(object.Key), object.Size)
 	}
 	return nil
 }
 
-func (t *Training) runTraining(ctx context.Context, jobName string, slideSize int, imgHeight, imgWidth int) error {
-	client := sagemaker.NewFromConfig(mustLoadConfig())
-	log.Printf("Start training job '%s'\n", jobName)
-	// TODO: check train data exist
-	jobOutput, err := client.CreateTrainingJob(
-		ctx,
-		&sagemaker.CreateTrainingJobInput{
-			EnableManagedSpotTraining: true,
-			AlgorithmSpecification: &types.AlgorithmSpecification{
-				TrainingInputMode: types.TrainingInputModeFile,
-				TrainingImage:     aws.String(t.ociImage),
-			},
-			OutputDataConfig: &types.OutputDataConfig{
-				S3OutputPath: aws.String(t.outputBucket),
-			},
-			ResourceConfig: &types.ResourceConfig{
-				InstanceCount:  1,
-				InstanceType:   types.TrainingInstanceTypeMlP2Xlarge,
-				VolumeSizeInGB: 1,
-			},
-			RoleArn: aws.String(t.roleArn),
-			StoppingCondition: &types.StoppingCondition{
-				MaxRuntimeInSeconds:  1800,
-				MaxWaitTimeInSeconds: aws.Int32(3600),
-			},
-			TrainingJobName: aws.String(jobName),
-			HyperParameters: map[string]string{
-				"sagemaker_region": "eu-west-1",
-				"slide_size":       strconv.Itoa(slideSize),
-				"img_height":       strconv.Itoa(imgHeight),
-				"img_width":        strconv.Itoa(imgWidth),
-			},
-			InputDataConfig: []types.Channel{
-				{
-					ChannelName: aws.String("train"),
-					DataSource: &types.DataSource{
-						S3DataSource: &types.S3DataSource{
-							S3DataType:             types.S3DataTypeS3Prefix,
-							S3Uri:                  aws.String(fmt.Sprintf("s3://%s/%s", t.bucketName, t.prefixInput)),
-							S3DataDistributionType: types.S3DataDistributionFullyReplicated,
-						},
-					},
+func (t *Training) runTraining(ctx context.Context, jobName string, slideSize int, imgHeight, imgWidth int, enableSpotTraining bool) error {
+	l := zap.S()
+	client := sagemaker.NewFromConfig(awsutils.MustLoadConfig())
+	l.Infof("Start training job '%s'", jobName)
+
+	trainingJobInput := sagemaker.CreateTrainingJobInput{
+		EnableManagedSpotTraining: enableSpotTraining,
+		AlgorithmSpecification: &types.AlgorithmSpecification{
+			TrainingInputMode: types.TrainingInputModeFile,
+			TrainingImage:     aws.String(t.ociImage),
+		},
+		OutputDataConfig: &types.OutputDataConfig{
+			S3OutputPath: aws.String(t.outputBucket),
+		},
+		ResourceConfig: &types.ResourceConfig{
+			InstanceCount: 1,
+			//InstanceType: types.TrainingInstanceTypeMlP2Xlarge,
+			InstanceType:   types.TrainingInstanceTypeMlG4dnXlarge,
+			VolumeSizeInGB: 1,
+		},
+		RoleArn: aws.String(t.roleArn),
+		StoppingCondition: &types.StoppingCondition{
+			MaxRuntimeInSeconds: 1800,
+		},
+		TrainingJobName: aws.String(jobName),
+		HyperParameters: map[string]string{
+			"sagemaker_region": "eu-west-1",
+			"slide_size":       strconv.Itoa(slideSize),
+			"img_height":       strconv.Itoa(imgHeight),
+			"img_width":        strconv.Itoa(imgWidth),
+		},
+		InputDataConfig: []types.Channel{
+			{
+				ChannelName: aws.String("train"),
+				DataSource: &types.DataSource{
+					S3DataSource: &types.S3DataSource{
+						S3DataType:             types.S3DataTypeS3Prefix,
+						S3Uri:                  aws.String(fmt.Sprintf("s3://%s/%s", t.bucketName, t.prefixInput)),
+						S3DataDistributionType: types.S3DataDistributionFullyReplicated,
+					},
+				},
+			},
+		},
+	}
+	if enableSpotTraining {
+		trainingJobInput.StoppingCondition.MaxWaitTimeInSeconds = aws.Int32(3600)
+	}
+
+	// TODO: check train data exist
+	jobOutput, err := client.CreateTrainingJob(
+		ctx,
+		&trainingJobInput,
+	)
+
 	if err != nil {
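Setting MaxWaitTimeInSeconds only when spot training is enabled matches the SageMaker CreateTrainingJob contract: the field is only valid for managed spot jobs, where it must be at least MaxRuntimeInSeconds. From the caller's side all of the new options travel through TrainDir; a sketch, assuming a configured *Training value t (paths and job name are placeholders; 120x160 matches the previously hardcoded image size):

	err := t.TrainDir(ctx, "train-job-2021", "/tmp/records",
		120, 160, // img height/width, now overridable per run
		0,        // slice size
		true,     // also archive flipped images
		"/tmp/model.tar.gz",
		false, // train on-demand: no AWS spot instances
	)
	if err != nil {
		log.Fatalf("unable to run training: %v", err)
	}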