2021-10-17 17:15:44 +00:00
|
|
|
package train
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
|
|
|
"github.com/aws/aws-sdk-go-v2/aws"
|
|
|
|
"github.com/aws/aws-sdk-go-v2/service/s3"
|
|
|
|
"github.com/aws/aws-sdk-go-v2/service/sagemaker"
|
|
|
|
"github.com/aws/aws-sdk-go-v2/service/sagemaker/types"
|
2021-11-24 18:10:52 +00:00
|
|
|
"github.com/cyrilix/robocar-tools/pkg/awsutils"
|
2021-10-17 17:15:44 +00:00
|
|
|
"github.com/cyrilix/robocar-tools/pkg/data"
|
2021-11-24 18:31:16 +00:00
|
|
|
"github.com/cyrilix/robocar-tools/pkg/models"
|
2021-11-24 18:32:24 +00:00
|
|
|
"go.uber.org/zap"
|
2021-10-17 17:15:44 +00:00
|
|
|
"strconv"
|
|
|
|
"time"
|
|
|
|
)
|
|
|
|
|
|
|
|
func New(bucketName string, ociImage, roleArn string) *Training {
|
|
|
|
return &Training{
|
2021-11-24 18:10:52 +00:00
|
|
|
config: awsutils.MustLoadConfig(),
|
2021-10-17 17:15:44 +00:00
|
|
|
bucketName: bucketName,
|
|
|
|
ociImage: ociImage,
|
|
|
|
roleArn: roleArn,
|
|
|
|
prefixInput: prefixInput,
|
|
|
|
outputBucket: fmt.Sprintf("s3://%s/output", bucketName),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
type Training struct {
|
|
|
|
config aws.Config
|
|
|
|
bucketName string
|
|
|
|
ociImage string
|
|
|
|
roleArn string
|
|
|
|
prefixInput string
|
|
|
|
outputBucket string
|
|
|
|
}
|
|
|
|
|
2022-06-09 10:19:54 +00:00
|
|
|
func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, imgWidth, imgHeight, sliceSize int, horizon int, withFlipImage bool, outputModelFile string, enableSpotTraining bool) error {
|
2021-11-24 18:32:24 +00:00
|
|
|
l := zap.S()
|
|
|
|
l.Infof("run training with data from %s", basedir)
|
2022-06-09 10:19:54 +00:00
|
|
|
archive, err := data.BuildArchive(basedir, sliceSize, imgWidth, imgHeight, horizon, withFlipImage)
|
2021-10-17 17:15:44 +00:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("unable to build data archive: %w", err)
|
|
|
|
}
|
2021-11-24 18:32:24 +00:00
|
|
|
l.Info("")
|
2021-10-17 17:15:44 +00:00
|
|
|
|
|
|
|
err = t.UploadArchive(ctx, archive)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("unable to upload data arrchive: %w", err)
|
|
|
|
}
|
2021-11-24 18:32:24 +00:00
|
|
|
l.Info("")
|
2021-10-17 17:15:44 +00:00
|
|
|
|
|
|
|
err = t.runTraining(
|
|
|
|
ctx,
|
|
|
|
jobName,
|
|
|
|
sliceSize,
|
2021-11-24 18:31:16 +00:00
|
|
|
imgHeight,
|
|
|
|
imgWidth,
|
|
|
|
enableSpotTraining,
|
2021-10-17 17:15:44 +00:00
|
|
|
)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("unable to run training: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
err = t.GetTrainingOutput(ctx, jobName, outputModelFile)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("unable to get output model file '%s': %w", outputModelFile, err)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func List(bucketName string) error {
|
2021-11-24 18:31:16 +00:00
|
|
|
l := zap.S()
|
2021-10-17 17:15:44 +00:00
|
|
|
pfxInput := prefixInput
|
|
|
|
|
|
|
|
// Create an Amazon S3 service client
|
2021-11-24 18:31:16 +00:00
|
|
|
client := s3.NewFromConfig(awsutils.MustLoadConfig())
|
2021-10-17 17:15:44 +00:00
|
|
|
|
|
|
|
// Get the first page of results for ListObjectsV2 for a bucket
|
|
|
|
output, err := client.ListObjectsV2(context.TODO(), &s3.ListObjectsV2Input{
|
|
|
|
Bucket: aws.String(bucketName),
|
|
|
|
Prefix: &pfxInput,
|
|
|
|
})
|
|
|
|
if err != nil {
|
2021-11-24 18:31:16 +00:00
|
|
|
l.Fatal(err)
|
2021-10-17 17:15:44 +00:00
|
|
|
}
|
|
|
|
|
2021-11-24 18:31:16 +00:00
|
|
|
l.Info("first page results:")
|
2021-10-17 17:15:44 +00:00
|
|
|
for _, object := range output.Contents {
|
|
|
|
if *object.Key == pfxInput {
|
|
|
|
continue
|
|
|
|
}
|
2021-11-24 18:31:16 +00:00
|
|
|
l.Infof("key=%s size=%d", aws.ToString(object.Key), object.Size)
|
2021-10-17 17:15:44 +00:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2021-11-24 18:31:16 +00:00
|
|
|
func (t *Training) runTraining(ctx context.Context, jobName string, slideSize int, imgHeight, imgWidth int, enableSpotTraining bool) error {
|
|
|
|
l := zap.S()
|
|
|
|
client := sagemaker.NewFromConfig(awsutils.MustLoadConfig())
|
|
|
|
l.Infof("Start training job '%s'", jobName)
|
|
|
|
|
|
|
|
trainingJobInput := sagemaker.CreateTrainingJobInput{
|
|
|
|
EnableManagedSpotTraining: enableSpotTraining,
|
|
|
|
AlgorithmSpecification: &types.AlgorithmSpecification{
|
|
|
|
TrainingInputMode: types.TrainingInputModeFile,
|
|
|
|
TrainingImage: aws.String(t.ociImage),
|
|
|
|
},
|
|
|
|
OutputDataConfig: &types.OutputDataConfig{
|
|
|
|
S3OutputPath: aws.String(t.outputBucket),
|
|
|
|
},
|
|
|
|
ResourceConfig: &types.ResourceConfig{
|
2022-06-09 10:19:54 +00:00
|
|
|
InstanceCount: 1,
|
2021-11-24 18:31:16 +00:00
|
|
|
//InstanceType: types.TrainingInstanceTypeMlP2Xlarge,
|
|
|
|
InstanceType: types.TrainingInstanceTypeMlG4dnXlarge,
|
|
|
|
VolumeSizeInGB: 1,
|
|
|
|
},
|
|
|
|
RoleArn: aws.String(t.roleArn),
|
|
|
|
StoppingCondition: &types.StoppingCondition{
|
|
|
|
MaxRuntimeInSeconds: 1800,
|
|
|
|
},
|
|
|
|
TrainingJobName: aws.String(jobName),
|
|
|
|
HyperParameters: map[string]string{
|
|
|
|
"sagemaker_region": "eu-west-1",
|
|
|
|
"slide_size": strconv.Itoa(slideSize),
|
|
|
|
"img_height": strconv.Itoa(imgHeight),
|
|
|
|
"img_width": strconv.Itoa(imgWidth),
|
|
|
|
},
|
|
|
|
InputDataConfig: []types.Channel{
|
|
|
|
{
|
|
|
|
ChannelName: aws.String("train"),
|
|
|
|
DataSource: &types.DataSource{
|
|
|
|
S3DataSource: &types.S3DataSource{
|
|
|
|
S3DataType: types.S3DataTypeS3Prefix,
|
|
|
|
S3Uri: aws.String(fmt.Sprintf("s3://%s/%s", t.bucketName, t.prefixInput)),
|
|
|
|
S3DataDistributionType: types.S3DataDistributionFullyReplicated,
|
2021-10-17 17:15:44 +00:00
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
2021-11-24 18:31:16 +00:00
|
|
|
}
|
|
|
|
if enableSpotTraining {
|
|
|
|
trainingJobInput.StoppingCondition.MaxWaitTimeInSeconds = aws.Int32(3600)
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: check train data exist
|
|
|
|
jobOutput, err := client.CreateTrainingJob(
|
|
|
|
ctx,
|
|
|
|
&trainingJobInput,
|
2021-10-17 17:15:44 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("unable to run sagemeker job: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
for {
|
|
|
|
time.Sleep(30 * time.Second)
|
|
|
|
|
|
|
|
status, err := client.DescribeTrainingJob(
|
|
|
|
ctx,
|
|
|
|
&sagemaker.DescribeTrainingJobInput{
|
|
|
|
TrainingJobName: aws.String(jobName),
|
|
|
|
},
|
|
|
|
)
|
|
|
|
if err != nil {
|
2021-11-24 18:32:24 +00:00
|
|
|
l.Infof("unable to get status from ob %v: %v", jobOutput.TrainingJobArn, err)
|
2021-10-17 17:15:44 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
switch status.TrainingJobStatus {
|
|
|
|
case types.TrainingJobStatusInProgress:
|
2022-06-09 10:19:54 +00:00
|
|
|
l.Infof("job in progress: %v - %v - %v", status.TrainingJobStatus, status.SecondaryStatus, *status.SecondaryStatusTransitions[len(status.SecondaryStatusTransitions)-1].StatusMessage)
|
2021-10-17 17:15:44 +00:00
|
|
|
continue
|
|
|
|
case types.TrainingJobStatusFailed:
|
2021-11-24 18:32:24 +00:00
|
|
|
return fmt.Errorf("job %s finished with status %v", jobName, status.TrainingJobStatus)
|
2021-10-17 17:15:44 +00:00
|
|
|
default:
|
2021-11-24 18:32:24 +00:00
|
|
|
l.Infof("job %s finished with status %v", jobName, status.TrainingJobStatus)
|
2021-10-17 17:15:44 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (t *Training) GetTrainingOutput(ctx context.Context, jobName, outputFile string) error {
|
2021-11-24 18:10:52 +00:00
|
|
|
modelPath := fmt.Sprintf("output/%s/output/model.tar.gz", jobName)
|
|
|
|
err := models.DownloadArchiveToFile(ctx, t.bucketName, modelPath, outputFile)
|
2021-10-17 17:15:44 +00:00
|
|
|
if err != nil {
|
2021-11-24 18:10:52 +00:00
|
|
|
return fmt.Errorf("unable to download training model '%s' to '%s' file: %w", modelPath, outputFile, err)
|
2021-10-17 17:15:44 +00:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func ListJob(ctx context.Context) error {
|
|
|
|
|
2021-11-24 18:10:52 +00:00
|
|
|
client := sagemaker.NewFromConfig(awsutils.MustLoadConfig())
|
2021-10-17 17:15:44 +00:00
|
|
|
jobs, err := client.ListTrainingJobs(ctx, &sagemaker.ListTrainingJobsInput{})
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("unable to list trainings jobs: %w", err)
|
|
|
|
}
|
|
|
|
for _, job := range jobs.TrainingJobSummaries {
|
|
|
|
fmt.Printf("%s\t\t%s\n", *job.TrainingJobName, job.TrainingJobStatus)
|
|
|
|
}
|
2022-06-09 10:19:54 +00:00
|
|
|
return nil
|
|
|
|
}
|