feat(train): add new command to interact with AWS and train models
pkg/data/data.go (new file, +191)
@@ -0,0 +1,191 @@
package data

import (
	"archive/zip"
	"bytes"
	"encoding/json"
	"fmt"
	"github.com/cyrilix/robocar-tools/record"
	log "github.com/sirupsen/logrus"
	"io/ioutil"
	"os"
	"path"
	"regexp"
)

var camSubDir = "cam"

func WriteArchive(basedir string, archiveName string, sliceSize int) error {
	content, err := BuildArchive(basedir, sliceSize)
	if err != nil {
		return fmt.Errorf("unable to build archive: %w", err)
	}
	err = ioutil.WriteFile(archiveName, content, os.FileMode(0755))
	if err != nil {
		return fmt.Errorf("unable to write archive content to disk: %w", err)
	}
	return nil
}

func BuildArchive(basedir string, sliceSize int) ([]byte, error) {
	log.Printf("build zip archive from %s\n", basedir)
	dirItems, err := ioutil.ReadDir(basedir)
	if err != nil {
		return nil, fmt.Errorf("unable to list directory in %v dir: %w", basedir, err)
	}

	imgCams := make([]string, 0)
	records := make([]string, 0)

	for _, dirItem := range dirItems {
		log.Debugf("process %v directory", dirItem.Name())
		imgDir := path.Join(basedir, dirItem.Name(), camSubDir)
		imgs, err := ioutil.ReadDir(imgDir)
		if err != nil {
			return nil, fmt.Errorf("unable to list cam images in directory %v: %w", imgDir, err)
		}

		for _, img := range imgs {
			idx, err := indexFromFile(img.Name())
			if err != nil {
				return nil, fmt.Errorf("unable to find index in cam image name %v: %w", img.Name(), err)
			}
			log.Debugf("found image with index %v", idx)
			records = append(records, path.Join(basedir, dirItem.Name(), fmt.Sprintf(record.RecorNameFormat, idx)))
			imgCams = append(imgCams, path.Join(basedir, dirItem.Name(), camSubDir, img.Name()))
		}
	}

	if sliceSize > 0 {
		imgCams, records, err = applySlice(imgCams, records, sliceSize)
		if err != nil {
			return nil, fmt.Errorf("unable to apply slice of %d records: %w", sliceSize, err)
		}
	}

	content, err := buildArchiveContent(imgCams, records)
	if err != nil {
		return nil, fmt.Errorf("unable to generate archive content: %w", err)
	}
	log.Printf("archive built\n")
	return content, nil
}

func applySlice(imgCams []string, records []string, sliceSize int) ([]string, []string, error) {
	// Shift pairing by sliceSize images: image i gets paired with record i+sliceSize
	i := imgCams[:len(imgCams)-sliceSize]
	r := records[sliceSize:]

	return i, r, nil
}

var indexRegexp *regexp.Regexp

func init() {
	re, err := regexp.Compile("image_array_(?P<idx>[0-9]+)\\.jpg$")
	if err != nil {
		log.Fatalf("unable to compile regex: %v", err)
	}
	indexRegexp = re
}

func indexFromFile(fileName string) (string, error) {
	matches := findNamedMatches(indexRegexp, fileName)
	if matches["idx"] == "" {
		return "", fmt.Errorf("no index in filename")
	}
	return matches["idx"], nil
}

func findNamedMatches(regex *regexp.Regexp, str string) map[string]string {
	match := regex.FindStringSubmatch(str)

	results := map[string]string{}
	for i, name := range match {
		results[regex.SubexpNames()[i]] = name
	}
	return results
}

func buildArchiveContent(imgFiles []string, recordFiles []string) ([]byte, error) {
	// Create a buffer to write our archive to.
	buf := new(bytes.Buffer)

	// Create a new zip archive.
	w := zip.NewWriter(buf)

	err := addJsonFiles(recordFiles, imgFiles, w)
	if err != nil {
		return nil, fmt.Errorf("unable to write json files in zip archive: %w", err)
	}

	err = addCamImages(imgFiles, w)
	if err != nil {
		return nil, fmt.Errorf("unable to add cam files in zip archive: %w", err)
	}

	err = w.Close()
	if err != nil {
		return nil, fmt.Errorf("unable to build archive: %w", err)
	}

	content, err := ioutil.ReadAll(buf)
	return content, err
}

func addCamImages(imgFiles []string, w *zip.Writer) error {
	for _, img := range imgFiles {
		imgContent, err := ioutil.ReadFile(img)
		if err != nil {
			return fmt.Errorf("unable to read img: %w", err)
		}

		_, imgName := path.Split(img)
		err = addToArchive(w, imgName, imgContent)
		if err != nil {
			return fmt.Errorf("unable to create new img entry in archive: %w", err)
		}
	}
	return nil
}

func addJsonFiles(recordFiles []string, imgCam []string, w *zip.Writer) error {
	for idx, r := range recordFiles {
		content, err := ioutil.ReadFile(r)
		if err != nil {
			return fmt.Errorf("unable to read json content: %w", err)
		}
		var rcd record.Record
		err = json.Unmarshal(content, &rcd)
		if err != nil {
			return fmt.Errorf("unable to unmarshal record: %w", err)
		}
		// Rewrite the image reference so it points to the flat name used in the archive
		_, camName := path.Split(imgCam[idx])
		rcd.CamImageArray = camName

		recordBytes, err := json.Marshal(&rcd)
		if err != nil {
			return fmt.Errorf("unable to marshal %v record: %w", rcd, err)
		}

		_, recordName := path.Split(r)
		err = addToArchive(w, recordName, recordBytes)
		if err != nil {
			return fmt.Errorf("unable to create new record in archive: %w", err)
		}
	}
	return nil
}

func addToArchive(w *zip.Writer, name string, content []byte) error {
	recordWriter, err := w.Create(name)
	if err != nil {
		return fmt.Errorf("unable to create new entry %v in archive: %w", name, err)
	}

	_, err = recordWriter.Write(content)
	if err != nil {
		return fmt.Errorf("unable to add content in %v zip archive: %w", name, err)
	}
	return nil
}
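
A minimal sketch of how this package is meant to be driven, assuming a record layout where each run directory holds a cam/ subfolder plus record_*.json files (all paths below are placeholders):

	package main

	import (
		"log"

		"github.com/cyrilix/robocar-tools/pkg/data"
	)

	func main() {
		// Build /tmp/train.zip from every run directory under /tmp/record.
		// Per applySlice, sliceSize=2 pairs image i with record i+2 so the
		// model learns to anticipate; 0 keeps the 1:1 image/record pairing.
		if err := data.WriteArchive("/tmp/record", "/tmp/train.zip", 2); err != nil {
			log.Fatalf("unable to write archive: %v", err)
		}
	}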
pkg/data/data_test.go (new file, +118)
@@ -0,0 +1,118 @@
package data

import (
	"archive/zip"
	"encoding/json"
	"fmt"
	"github.com/cyrilix/robocar-tools/record"
	log "github.com/sirupsen/logrus"
	"io/ioutil"
	"os"
	"path"
	"strings"
	"testing"
)

func TestBuildArchive(t *testing.T) {
	tmpDir, err := ioutil.TempDir("", "buildarchive")
	if err != nil {
		t.Fatalf("unable to make tmpdir: %v", err)
	}
	defer func() {
		err := os.RemoveAll(tmpDir)
		if err != nil {
			log.Warnf("unable to remove tempdir %v: %v", tmpDir, err)
		}
	}()

	archive := path.Join(tmpDir, "train.zip")

	expectedRecordFiles, expectedImgFiles := expectedFiles()

	err = WriteArchive("testdata", archive, 0)
	if err != nil {
		t.Errorf("unable to build archive: %v", err)
	}

	r, err := zip.OpenReader(archive)
	if err != nil {
		t.Errorf("unable to read archive: %v", err)
	}
	defer r.Close()

	if len(r.File) != len(expectedImgFiles)+len(expectedRecordFiles) {
		t.Errorf("bad number of files in archive: %v, wants %v", len(r.File), len(expectedImgFiles)+len(expectedRecordFiles))
	}

	// Iterate through the files in the archive and check their contents.
	for _, f := range r.File {
		filename := f.Name
		if filename[len(filename)-4:] == "json" {
			expectedRecordFiles[filename] = true
			expectedImgName := strings.Replace(filename, "record", "cam-image_array", 1)
			expectedImgName = strings.Replace(expectedImgName, "json", "jpg", 1)
			checkJsonContent(t, f, expectedImgName)
			continue
		}
		if filename[len(filename)-3:] == "jpg" {
			expectedImgFiles[filename] = true
			continue
		}
		t.Errorf("unexpected file in archive: %v", filename)
	}

	checkAllFilesAreFoundInArchive(expectedRecordFiles, t, expectedImgFiles)
}

func checkAllFilesAreFoundInArchive(expectedRecordFiles map[string]bool, t *testing.T, expectedImgFiles map[string]bool) {
	for f, found := range expectedRecordFiles {
		if !found {
			t.Errorf("%v not found in archive", f)
		}
	}
	for f, found := range expectedImgFiles {
		if !found {
			t.Errorf("%v not found in archive", f)
		}
	}
}

func checkJsonContent(t *testing.T, f *zip.File, expectedCamImage string) {
	rc, err := f.Open()
	if err != nil {
		t.Errorf("unable to read file content of %v: %v", f.Name, err)
	}
	defer rc.Close()

	content, err := ioutil.ReadAll(rc)
	if err != nil {
		t.Errorf("%v has invalid json content: %v", f.Name, err)
	}
	var rcd record.Record
	err = json.Unmarshal(content, &rcd)
	if err != nil {
		t.Errorf("unable to unmarshal json content of %v: %v", f.Name, err)
	}

	if rcd.CamImageArray != expectedCamImage {
		t.Errorf("record %v: invalid image ref: %v, wants %v", f.Name, rcd.CamImageArray, expectedCamImage)
	}
	if rcd.UserAngle == 0. {
		t.Errorf("record %v: user angle has not been initialised", f.Name)
	}
}

func expectedFiles() (map[string]bool, map[string]bool) {
	expectedRecordFiles := make(map[string]bool)
	expectedImgFiles := make(map[string]bool)
	for i := 1; i <= 8; i++ {
		expectedRecordFiles[fmt.Sprintf("record_%07d.json", i)] = false
		expectedImgFiles[fmt.Sprintf("cam-image_array_%07d.jpg", i)] = false
	}
	for i := 101; i <= 106; i++ {
		expectedRecordFiles[fmt.Sprintf("record_%07d.json", i)] = false
		expectedImgFiles[fmt.Sprintf("cam-image_array_%07d.jpg", i)] = false
	}
	return expectedRecordFiles, expectedImgFiles
}
BIN pkg/data/testdata/2020021819-3/cam/cam-image_array_0000001.jpg (vendored, executable, 2.5 KiB)
BIN pkg/data/testdata/2020021819-3/cam/cam-image_array_0000002.jpg (vendored, executable, 2.5 KiB)
BIN pkg/data/testdata/2020021819-3/cam/cam-image_array_0000003.jpg (vendored, executable, 2.5 KiB)
BIN pkg/data/testdata/2020021819-3/cam/cam-image_array_0000004.jpg (vendored, executable, 2.5 KiB)
BIN pkg/data/testdata/2020021819-3/cam/cam-image_array_0000005.jpg (vendored, executable, 2.5 KiB)
BIN pkg/data/testdata/2020021819-3/cam/cam-image_array_0000006.jpg (vendored, executable, 2.5 KiB)
BIN pkg/data/testdata/2020021819-3/cam/cam-image_array_0000007.jpg (vendored, executable, 2.5 KiB)
BIN pkg/data/testdata/2020021819-3/cam/cam-image_array_0000008.jpg (vendored, executable, 2.5 KiB)

pkg/data/testdata/2020021819-3/record_0000001.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.04117644,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000001.jpg"}

pkg/data/testdata/2020021819-3/record_0000002.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.045098066,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000002.jpg"}

pkg/data/testdata/2020021819-3/record_0000003.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.045098066,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000003.jpg"}

pkg/data/testdata/2020021819-3/record_0000004.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.04117644,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000004.jpg"}

pkg/data/testdata/2020021819-3/record_0000005.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.04117644,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000005.jpg"}

pkg/data/testdata/2020021819-3/record_0000006.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.043137312,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000006.jpg"}

pkg/data/testdata/2020021819-3/record_0000007.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.04117644,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000007.jpg"}

pkg/data/testdata/2020021819-3/record_0000008.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.045098066,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000008.jpg"}

BIN pkg/data/testdata/2020021819-4/cam/cam-image_array_0000101.jpg (vendored, executable, 2.5 KiB)
BIN pkg/data/testdata/2020021819-4/cam/cam-image_array_0000102.jpg (vendored, executable, 2.5 KiB)
BIN pkg/data/testdata/2020021819-4/cam/cam-image_array_0000103.jpg (vendored, executable, 2.5 KiB)
BIN pkg/data/testdata/2020021819-4/cam/cam-image_array_0000104.jpg (vendored, executable, 2.5 KiB)
BIN pkg/data/testdata/2020021819-4/cam/cam-image_array_0000105.jpg (vendored, executable, 2.5 KiB)
BIN pkg/data/testdata/2020021819-4/cam/cam-image_array_0000106.jpg (vendored, executable, 2.5 KiB)

pkg/data/testdata/2020021819-4/record_0000101.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.04117644,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000101.jpg"}

pkg/data/testdata/2020021819-4/record_0000102.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.045098066,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000102.jpg"}

pkg/data/testdata/2020021819-4/record_0000103.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.045098066,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000103.jpg"}

pkg/data/testdata/2020021819-4/record_0000104.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.04117644,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000104.jpg"}

pkg/data/testdata/2020021819-4/record_0000105.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.04117644,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000105.jpg"}

pkg/data/testdata/2020021819-4/record_0000106.json (vendored, executable, +1)
@@ -0,0 +1 @@
{"user/angle":0.043137312,"cam/image_array":"/tmp/record//2020021819-3/cam/cam-image_array_0000106.jpg"}
pkg/train/archives.go (new file, +47)
@@ -0,0 +1,47 @@
package train

import (
	"bytes"
	"context"
	"fmt"
	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/service/s3"
	"github.com/aws/aws-sdk-go-v2/service/s3/types"
	"log"
)

func ListArchives(ctx context.Context, bucket string) error {
	client := s3.NewFromConfig(mustLoadConfig())

	prefix := aws.String("input/data/train/train.zip")
	objects, err := client.ListObjectsV2(ctx, &s3.ListObjectsV2Input{
		Bucket: aws.String(bucket),
		Prefix: prefix,
	})
	if err != nil {
		return fmt.Errorf("unable to list objects in bucket %v: %w", bucket, err)
	}
	fmt.Printf("objects: %v\n", objects)

	return nil
}

func (t Training) UploadArchive(ctx context.Context, archive []byte) error {
	client := s3.NewFromConfig(t.config)
	key := aws.String("input/data/train/train.zip")

	log.Printf("upload archive to bucket '%s/%s'\n", t.bucketName, *key)
	_, err := client.PutObject(
		ctx,
		&s3.PutObjectInput{
			ACL:    types.ObjectCannedACLPrivate,
			Body:   bytes.NewReader(archive),
			Bucket: aws.String(t.bucketName),
			Key:    key,
		})
	if err != nil {
		return fmt.Errorf("unable to upload archive: %w", err)
	}
	log.Println("archive uploaded")
	return nil
}
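
A sketch of the upload path in isolation; the bucket, image, role and path literals are placeholders, not values shipped with the tool:

	package main

	import (
		"context"
		"log"

		"github.com/cyrilix/robocar-tools/pkg/data"
		"github.com/cyrilix/robocar-tools/pkg/train"
	)

	func main() {
		// Build the archive locally, then push it to the fixed S3 key
		// input/data/train/train.zip used by UploadArchive.
		archive, err := data.BuildArchive("/tmp/record", 0) // placeholder path
		if err != nil {
			log.Fatalf("unable to build archive: %v", err)
		}
		t := train.New(
			"my-robocar-bucket", // placeholder bucket
			"123456789012.dkr.ecr.eu-west-1.amazonaws.com/robocar-train:latest", // placeholder image
			"arn:aws:iam::123456789012:role/SageMakerRole", // placeholder role
		)
		if err := t.UploadArchive(context.Background(), archive); err != nil {
			log.Fatalf("unable to upload archive: %v", err)
		}
	}

ListArchives can then be used to confirm the object landed under that key.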
pkg/train/archives_test.go (new file, +12)
@@ -0,0 +1,12 @@
package train

import (
	"context"
	"testing"
)

func TestListArchives(t *testing.T) {
	// NOTE: the bucket name is a placeholder; this test needs AWS
	// credentials and an existing bucket to pass.
	err := ListArchives(context.Background(), "my-robocar-bucket")
	if err != nil {
		t.Errorf("unable to list archives: %v", err)
	}
}
pkg/train/config.go (new file, +22)
@@ -0,0 +1,22 @@
package train

import (
	"context"
	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/config"
	"log"
)

const (
	prefixInput = "input/data/train/"
)

func mustLoadConfig() aws.Config {
	c, err := config.LoadDefaultConfig(context.Background())
	if err != nil {
		log.Panicf("unable to load aws default config: %v", err)
	}
	return c
}
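
mustLoadConfig relies on the SDK's default resolution chain (environment variables, then shared config/credentials files, then instance roles), so region and credentials come from the caller's environment. A sketch of a variant that pins the region in code instead; mustLoadConfigWithRegion is a hypothetical helper, not part of this change, while config.WithRegion is the standard option from aws-sdk-go-v2/config:

	// Hypothetical variant: pin the region explicitly rather than rely on
	// AWS_REGION or the shared config file.
	func mustLoadConfigWithRegion(region string) aws.Config {
		c, err := config.LoadDefaultConfig(context.Background(), config.WithRegion(region))
		if err != nil {
			log.Panicf("unable to load aws config: %v", err)
		}
		return c
	}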
pkg/train/train.go (new file, +210)
@@ -0,0 +1,210 @@
package train

import (
	"context"
	"fmt"
	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/service/s3"
	"github.com/aws/aws-sdk-go-v2/service/sagemaker"
	"github.com/aws/aws-sdk-go-v2/service/sagemaker/types"
	"github.com/cyrilix/robocar-tools/pkg/data"
	"io/fs"
	"io/ioutil"
	"log"
	"strconv"
	"time"
)

func New(bucketName string, ociImage, roleArn string) *Training {
	return &Training{
		config:       mustLoadConfig(),
		bucketName:   bucketName,
		ociImage:     ociImage,
		roleArn:      roleArn,
		prefixInput:  prefixInput,
		outputBucket: fmt.Sprintf("s3://%s/output", bucketName),
	}
}

type Training struct {
	config       aws.Config
	bucketName   string
	ociImage     string
	roleArn      string
	prefixInput  string
	outputBucket string
}

func (t *Training) TrainDir(ctx context.Context, jobName, basedir string, sliceSize int, outputModelFile string) error {
	log.Printf("run training with data from %s\n", basedir)
	archive, err := data.BuildArchive(basedir, sliceSize)
	if err != nil {
		return fmt.Errorf("unable to build data archive: %w", err)
	}

	err = t.UploadArchive(ctx, archive)
	if err != nil {
		return fmt.Errorf("unable to upload data archive: %w", err)
	}

	err = t.runTraining(
		ctx,
		jobName,
		sliceSize,
		120,
		160,
	)
	if err != nil {
		return fmt.Errorf("unable to run training: %w", err)
	}

	err = t.GetTrainingOutput(ctx, jobName, outputModelFile)
	if err != nil {
		return fmt.Errorf("unable to get output model file '%s': %w", outputModelFile, err)
	}
	return nil
}

func List(bucketName string) error {
	pfxInput := prefixInput

	// Create an Amazon S3 service client
	client := s3.NewFromConfig(mustLoadConfig())

	// Get the first page of results for ListObjectsV2 for a bucket
	output, err := client.ListObjectsV2(context.TODO(), &s3.ListObjectsV2Input{
		Bucket: aws.String(bucketName),
		Prefix: &pfxInput,
	})
	if err != nil {
		return fmt.Errorf("unable to list objects in bucket %v: %w", bucketName, err)
	}

	log.Println("first page results:")
	for _, object := range output.Contents {
		if *object.Key == pfxInput {
			continue
		}
		log.Printf("key=%s size=%d", aws.ToString(object.Key), object.Size)
	}
	return nil
}

func (t *Training) runTraining(ctx context.Context, jobName string, slideSize int, imgHeight, imgWidth int) error {
	client := sagemaker.NewFromConfig(mustLoadConfig())
	log.Printf("Start training job '%s'\n", jobName)
	// TODO: check train data exist
	jobOutput, err := client.CreateTrainingJob(
		ctx,
		&sagemaker.CreateTrainingJobInput{
			EnableManagedSpotTraining: true,
			AlgorithmSpecification: &types.AlgorithmSpecification{
				TrainingInputMode: types.TrainingInputModeFile,
				TrainingImage:     aws.String(t.ociImage),
			},
			OutputDataConfig: &types.OutputDataConfig{
				S3OutputPath: aws.String(t.outputBucket),
			},
			ResourceConfig: &types.ResourceConfig{
				InstanceCount:  1,
				InstanceType:   types.TrainingInstanceTypeMlP2Xlarge,
				VolumeSizeInGB: 1,
			},
			RoleArn: aws.String(t.roleArn),
			StoppingCondition: &types.StoppingCondition{
				MaxRuntimeInSeconds:  1800,
				MaxWaitTimeInSeconds: aws.Int32(3600),
			},
			TrainingJobName: aws.String(jobName),
			HyperParameters: map[string]string{
				"sagemaker_region": "eu-west-1",
				"slide_size":       strconv.Itoa(slideSize),
				"img_height":       strconv.Itoa(imgHeight),
				"img_width":        strconv.Itoa(imgWidth),
			},
			InputDataConfig: []types.Channel{
				{
					ChannelName: aws.String("train"),
					DataSource: &types.DataSource{
						S3DataSource: &types.S3DataSource{
							S3DataType:             types.S3DataTypeS3Prefix,
							S3Uri:                  aws.String(fmt.Sprintf("s3://%s/%s", t.bucketName, t.prefixInput)),
							S3DataDistributionType: types.S3DataDistributionFullyReplicated,
						},
					},
				},
			},
		},
	)
	if err != nil {
		return fmt.Errorf("unable to run sagemaker job: %w", err)
	}

	for {
		time.Sleep(30 * time.Second)

		status, err := client.DescribeTrainingJob(
			ctx,
			&sagemaker.DescribeTrainingJobInput{
				TrainingJobName: aws.String(jobName),
			},
		)
		if err != nil {
			log.Printf("unable to get status from job %v: %v\n", jobOutput.TrainingJobArn, err)
			continue
		}
		switch status.TrainingJobStatus {
		case types.TrainingJobStatusInProgress:
			// Guard against an empty transition list early in the job
			msg := ""
			if nb := len(status.SecondaryStatusTransitions); nb > 0 {
				msg = aws.ToString(status.SecondaryStatusTransitions[nb-1].StatusMessage)
			}
			log.Printf("job in progress: %v - %v - %v\n", status.TrainingJobStatus, status.SecondaryStatus, msg)
			continue
		case types.TrainingJobStatusFailed:
			return fmt.Errorf("job %s finished with status %v", jobName, status.TrainingJobStatus)
		default:
			log.Printf("job %s finished with status %v\n", jobName, status.TrainingJobStatus)
			return nil
		}
	}
}

func (t *Training) GetTrainingOutput(ctx context.Context, jobName, outputFile string) error {
	// Create an Amazon S3 service client
	client := s3.NewFromConfig(t.config)

	// Download the model archive produced by the training job
	output, err := client.GetObject(
		ctx,
		&s3.GetObjectInput{
			Bucket: aws.String(t.bucketName),
			Key:    aws.String(fmt.Sprintf("output/%s/model.tar.gz", jobName)),
		},
	)
	if err != nil {
		return fmt.Errorf("unable to get resource: %w", err)
	}
	content, err := ioutil.ReadAll(output.Body)
	if err != nil {
		return fmt.Errorf("unable to read output content: %w", err)
	}
	err = ioutil.WriteFile(outputFile, content, fs.ModePerm)
	if err != nil {
		return fmt.Errorf("unable to write content to '%v': %w", outputFile, err)
	}
	return nil
}

func ListJob(ctx context.Context) error {
	client := sagemaker.NewFromConfig(mustLoadConfig())
	jobs, err := client.ListTrainingJobs(ctx, &sagemaker.ListTrainingJobsInput{})
	if err != nil {
		return fmt.Errorf("unable to list training jobs: %w", err)
	}
	for _, job := range jobs.TrainingJobSummaries {
		fmt.Printf("%s\t\t%s\n", *job.TrainingJobName, job.TrainingJobStatus)
	}
	return nil
}
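
Put together, the new train command reduces to a call like this sketch; every literal below (bucket, image, role ARN, job name, paths) is a placeholder, not a default shipped with the tool:

	package main

	import (
		"context"
		"log"

		"github.com/cyrilix/robocar-tools/pkg/train"
	)

	func main() {
		training := train.New(
			"my-robocar-bucket", // placeholder bucket
			"123456789012.dkr.ecr.eu-west-1.amazonaws.com/robocar-train:latest", // placeholder image
			"arn:aws:iam::123456789012:role/SageMakerRole", // placeholder role
		)
		// Builds the zip archive, uploads it to s3://<bucket>/input/data/train/,
		// starts the SageMaker job, polls it every 30s, then downloads model.tar.gz.
		err := training.TrainDir(context.Background(), "robocar-train-001", "/tmp/record", 2, "/tmp/model.tar.gz")
		if err != nil {
			log.Fatalf("training failed: %v", err)
		}
	}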
pkg/train/train_test.go (new file, +22)
@@ -0,0 +1,22 @@
package train

import "testing"

func TestList(t *testing.T) {
	tests := []struct {
		name    string
		wantErr bool
	}{
		{
			name:    "default",
			wantErr: false,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// NOTE: the bucket name is a placeholder; this test needs AWS
			// credentials and an existing bucket to pass.
			if err := List("my-robocar-bucket"); (err != nil) != tt.wantErr {
				t.Errorf("List() error = %v, wantErr %v", err, tt.wantErr)
			}
		})
	}
}