feat(training): new features
* add flip-image option
* add command to list models
* add option to override image size when training is launched
* add option to disable AWS spot instances
This commit is contained in:
		@@ -9,8 +9,7 @@ import (
 | 
			
		||||
	"github.com/aws/aws-sdk-go-v2/service/sagemaker/types"
 | 
			
		||||
	"github.com/cyrilix/robocar-tools/pkg/awsutils"
 | 
			
		||||
	"github.com/cyrilix/robocar-tools/pkg/data"
 | 
			
		||||
	"io/fs"
 | 
			
		||||
	"io/ioutil"
 | 
			
		||||
	"github.com/cyrilix/robocar-tools/pkg/models"
 | 
			
		||||
	"log"
 | 
			
		||||
	"strconv"
 | 
			
		||||
	"time"
 | 
			
		||||
@@ -36,9 +35,9 @@ type Training struct {
 | 
			
		||||
	outputBucket string
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, sliceSize int, outputModelFile string) error {
 | 
			
		||||
func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, imgHeight, imgWidth int, sliceSize int, withFlipImage bool, outputModelFile string, enableSpotTraining bool) error {
 | 
			
		||||
	log.Printf("run training with data from %s\n", basedir)
 | 
			
		||||
	archive, err := data.BuildArchive(basedir, sliceSize)
 | 
			
		||||
	archive, err := data.BuildArchive(basedir, sliceSize, withFlipImage)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		return fmt.Errorf("unable to build data archive: %w", err)
 | 
			
		||||
	}
 | 
			
		||||
@@ -54,8 +53,9 @@ func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, sliceS
 | 
			
		||||
		ctx,
 | 
			
		||||
		jobName,
 | 
			
		||||
		sliceSize,
 | 
			
		||||
		120,
 | 
			
		||||
		160,
 | 
			
		||||
		imgHeight,
 | 
			
		||||
		imgWidth,
 | 
			
		||||
		enableSpotTraining,
 | 
			
		||||
	)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		return fmt.Errorf("unable to run training: %w", err)
 | 
			
		||||
@@ -69,11 +69,11 @@ func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, sliceS
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func List(bucketName string) error {
 | 
			
		||||
 | 
			
		||||
	l := zap.S()
 | 
			
		||||
	pfxInput := prefixInput
 | 
			
		||||
 | 
			
		||||
	// Create an Amazon S3 service client
 | 
			
		||||
	client := s3.NewFromConfig(mustLoadConfig())
 | 
			
		||||
	client := s3.NewFromConfig(awsutils.MustLoadConfig())
 | 
			
		||||
 | 
			
		||||
	// Get the first page of results for ListObjectsV2 for a bucket
 | 
			
		||||
	output, err := client.ListObjectsV2(context.TODO(), &s3.ListObjectsV2Input{
 | 
			
		||||
@@ -81,64 +81,71 @@ func List(bucketName string) error {
 | 
			
		||||
		Prefix: &pfxInput,
 | 
			
		||||
	})
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		log.Fatal(err)
 | 
			
		||||
		l.Fatal(err)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	log.Println("first page results:")
 | 
			
		||||
	l.Info("first page results:")
 | 
			
		||||
	for _, object := range output.Contents {
 | 
			
		||||
		if *object.Key == pfxInput {
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
		log.Printf("key=%s size=%d", aws.ToString(object.Key), object.Size)
 | 
			
		||||
		l.Infof("key=%s size=%d", aws.ToString(object.Key), object.Size)
 | 
			
		||||
	}
 | 
			
		||||
	return nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (t *Training) runTraining(ctx context.Context, jobName string, slideSize int, imgHeight, imgWidth int) error {
 | 
			
		||||
	client := sagemaker.NewFromConfig(mustLoadConfig())
 | 
			
		||||
	log.Printf("Start training job '%s'\n", jobName)
 | 
			
		||||
	// TODO: check train data exist
 | 
			
		||||
	jobOutput, err := client.CreateTrainingJob(
 | 
			
		||||
		ctx,
 | 
			
		||||
		&sagemaker.CreateTrainingJobInput{
 | 
			
		||||
			EnableManagedSpotTraining: true,
 | 
			
		||||
			AlgorithmSpecification: &types.AlgorithmSpecification{
 | 
			
		||||
				TrainingInputMode: types.TrainingInputModeFile,
 | 
			
		||||
				TrainingImage:     aws.String(t.ociImage),
 | 
			
		||||
			},
 | 
			
		||||
			OutputDataConfig: &types.OutputDataConfig{
 | 
			
		||||
				S3OutputPath: aws.String(t.outputBucket),
 | 
			
		||||
			},
 | 
			
		||||
			ResourceConfig: &types.ResourceConfig{
 | 
			
		||||
				InstanceCount:  1,
 | 
			
		||||
				InstanceType:   types.TrainingInstanceTypeMlP2Xlarge,
 | 
			
		||||
				VolumeSizeInGB: 1,
 | 
			
		||||
			},
 | 
			
		||||
			RoleArn: aws.String(t.roleArn),
 | 
			
		||||
			StoppingCondition: &types.StoppingCondition{
 | 
			
		||||
				MaxRuntimeInSeconds: 1800,
 | 
			
		||||
				MaxWaitTimeInSeconds: aws.Int32(3600),
 | 
			
		||||
			},
 | 
			
		||||
			TrainingJobName: aws.String(jobName),
 | 
			
		||||
			HyperParameters: map[string]string{
 | 
			
		||||
				"sagemaker_region": "eu-west-1",
 | 
			
		||||
				"slide_size":       strconv.Itoa(slideSize),
 | 
			
		||||
				"img_height":       strconv.Itoa(imgHeight),
 | 
			
		||||
				"img_width":        strconv.Itoa(imgWidth),
 | 
			
		||||
			},
 | 
			
		||||
			InputDataConfig: []types.Channel{
 | 
			
		||||
				{
 | 
			
		||||
					ChannelName: aws.String("train"),
 | 
			
		||||
					DataSource: &types.DataSource{
 | 
			
		||||
						S3DataSource: &types.S3DataSource{
 | 
			
		||||
							S3DataType:             types.S3DataTypeS3Prefix,
 | 
			
		||||
							S3Uri:                  aws.String(fmt.Sprintf("s3://%s/%s", t.bucketName, t.prefixInput)),
 | 
			
		||||
							S3DataDistributionType: types.S3DataDistributionFullyReplicated,
 | 
			
		||||
						},
 | 
			
		||||
func (t *Training) runTraining(ctx context.Context, jobName string, slideSize int, imgHeight, imgWidth int, enableSpotTraining bool) error {
 | 
			
		||||
	l := zap.S()
 | 
			
		||||
	client := sagemaker.NewFromConfig(awsutils.MustLoadConfig())
 | 
			
		||||
	l.Infof("Start training job '%s'", jobName)
 | 
			
		||||
 | 
			
		||||
	trainingJobInput := sagemaker.CreateTrainingJobInput{
 | 
			
		||||
		EnableManagedSpotTraining: enableSpotTraining,
 | 
			
		||||
		AlgorithmSpecification: &types.AlgorithmSpecification{
 | 
			
		||||
			TrainingInputMode: types.TrainingInputModeFile,
 | 
			
		||||
			TrainingImage:     aws.String(t.ociImage),
 | 
			
		||||
		},
 | 
			
		||||
		OutputDataConfig: &types.OutputDataConfig{
 | 
			
		||||
			S3OutputPath: aws.String(t.outputBucket),
 | 
			
		||||
		},
 | 
			
		||||
		ResourceConfig: &types.ResourceConfig{
 | 
			
		||||
			InstanceCount:  1,
 | 
			
		||||
			//InstanceType:   types.TrainingInstanceTypeMlP2Xlarge,
 | 
			
		||||
			InstanceType:   types.TrainingInstanceTypeMlG4dnXlarge,
 | 
			
		||||
			VolumeSizeInGB: 1,
 | 
			
		||||
		},
 | 
			
		||||
		RoleArn: aws.String(t.roleArn),
 | 
			
		||||
		StoppingCondition: &types.StoppingCondition{
 | 
			
		||||
			MaxRuntimeInSeconds: 1800,
 | 
			
		||||
		},
 | 
			
		||||
		TrainingJobName: aws.String(jobName),
 | 
			
		||||
		HyperParameters: map[string]string{
 | 
			
		||||
			"sagemaker_region": "eu-west-1",
 | 
			
		||||
			"slide_size":       strconv.Itoa(slideSize),
 | 
			
		||||
			"img_height":       strconv.Itoa(imgHeight),
 | 
			
		||||
			"img_width":        strconv.Itoa(imgWidth),
 | 
			
		||||
		},
 | 
			
		||||
		InputDataConfig: []types.Channel{
 | 
			
		||||
			{
 | 
			
		||||
				ChannelName: aws.String("train"),
 | 
			
		||||
				DataSource: &types.DataSource{
 | 
			
		||||
					S3DataSource: &types.S3DataSource{
 | 
			
		||||
						S3DataType:             types.S3DataTypeS3Prefix,
 | 
			
		||||
						S3Uri:                  aws.String(fmt.Sprintf("s3://%s/%s", t.bucketName, t.prefixInput)),
 | 
			
		||||
						S3DataDistributionType: types.S3DataDistributionFullyReplicated,
 | 
			
		||||
					},
 | 
			
		||||
				},
 | 
			
		||||
			},
 | 
			
		||||
		},
 | 
			
		||||
	}
 | 
			
		||||
	if enableSpotTraining {
 | 
			
		||||
		trainingJobInput.StoppingCondition.MaxWaitTimeInSeconds = aws.Int32(3600)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// TODO: check train data exist
 | 
			
		||||
	jobOutput, err := client.CreateTrainingJob(
 | 
			
		||||
		ctx,
 | 
			
		||||
		&trainingJobInput,
 | 
			
		||||
	)
 | 
			
		||||
 | 
			
		||||
	if err != nil {
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user