I'm going to start by showing the code, and then explain what I'm trying to do:
package main

import (
    "fmt"
    "os"
    "path/filepath"

    "github.com/aws/aws-sdk-go/aws"
    "github.com/aws/aws-sdk-go/aws/session"
    "github.com/aws/aws-sdk-go/service/s3"
    "github.com/aws/aws-sdk-go/service/s3/s3manager"
)

var (
    // empty strings for security reasons
    Bucket         = ""       // Download from this bucket
    Prefix         = ""       // Using this key prefix
    LocalDirectory = "s3logs" // Into this directory
)

func main() {
    sess := session.New()
    client := s3.New(sess, &aws.Config{Region: aws.String("us-west-1")})
    params := &s3.ListObjectsInput{Bucket: &Bucket, Prefix: &Prefix}
    manager := s3manager.NewDownloaderWithClient(client, func(d *s3manager.Downloader) {
        d.PartSize = 64 * 1024 * 1024 // 64MB per part
        d.Concurrency = 8
    }) // works
    //manager := s3manager.NewDownloaderWithClient(client) // works
    d := downloader{bucket: Bucket, dir: LocalDirectory, Downloader: manager}
    if err := client.ListObjectsPages(params, d.eachPage); err != nil {
        panic(err)
    }
}

type downloader struct {
    *s3manager.Downloader
    bucket, dir string
}

func (d *downloader) eachPage(page *s3.ListObjectsOutput, more bool) bool {
    for _, obj := range page.Contents {
        d.downloadToFile(*obj.Key)
    }
    return true
}

func (d *downloader) downloadToFile(key string) {
    // Create the directories in the path
    file := filepath.Join(d.dir, key)
    if err := os.MkdirAll(filepath.Dir(file), 0775); err != nil {
        panic(err)
    }

    // Set up the local file
    fd, err := os.Create(file)
    if err != nil {
        panic(err)
    }
    defer fd.Close()

    // Download the file using the AWS SDK
    fmt.Printf("Downloading s3://%s/%s to %s...\n", d.bucket, key, file)
    params := &s3.GetObjectInput{Bucket: &d.bucket, Key: &key}
    _, e := d.Download(fd, params)
    if e != nil {
        panic(e)
    }
}
I'm trying to download the log files from a particular bucket, and eventually from many buckets. I need the download to be as fast as possible because there is a lot of data. My question is: what is the most effective way to download huge amounts of data quickly? The whole process is pointless if the logs can't be downloaded at a reasonable speed. Is there a faster way? It's already concurrent according to Amazon's docs. Any ideas? Also, I've noticed a curious thing: it doesn't matter whether I set Concurrency to 1, 4, or 20; everything still downloads at roughly 0.70 to 0.80 GB/min.
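One thing that may explain the flat numbers: the Downloader's Concurrency setting only parallelizes the parts of a single object, so a bucket full of smallish log files is still fetched one object at a time. What usually helps is downloading several objects at once. Here is a minimal sketch of how the tail of main could be restructured (my addition, not from the original code; nWorkers and the extra sync import are assumptions to tune):

var wg sync.WaitGroup
keys := make(chan string)
const nWorkers = 16 // illustrative value; tune against your link
for i := 0; i < nWorkers; i++ {
    wg.Add(1)
    go func() {
        defer wg.Done()
        for key := range keys {
            d.downloadToFile(key) // same helper as above
        }
    }()
}
client.ListObjectsPages(params, func(page *s3.ListObjectsOutput, more bool) bool {
    for _, obj := range page.Contents {
        keys <- *obj.Key // feed every listed key to the worker pool
    }
    return true
})
close(keys)
wg.Wait()

With many small objects, throughput then scales with the number of in-flight downloads until the network link or S3 throttling becomes the limit.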
I have a file with a list of 600 regex patterns that must be tried in order to find the specific ID for a website.
Example:
regex/www\.effectiveperformanceformat\.com/5
regex/bam-cell\.nr-data\.net/5
regex/advgoogle\.com/5
regex/googleapi\.club/5
regex/doubleclickbygoogle\.com/5
regex/googlerank\.info/5
regex/google-pr7\.de/5
regex/usemarketings\.com/5
regex/google-rank\.org/5
regex/googleanalytcs\.com/5
regex/xml\.trafficmoose\.com/5
regex/265\.com/5
regex/app-measurement\.com/5
regex/loftsbaacad\.com/5
regex/toldmeflex\.com/5
regex/r\.baresi\.xyz/5
regex/molodgytot\.biz/5
regex/ec\.walkme\.com/5
regex/px\.ads\.linkedin\.com/5
regex/hinisanex\.biz/5
regex/buysellads\.com/5
regex/buysellads\.net/5
regex/servedby-buysellads\.com/5
regex/carbonads\.(net|com)/5
regex/oulddev\.biz/5
regex/click\.hoolig\.app/5
regex/engine\.blacraft\.com/5
regex/mc\.yandex\.ru/5
regex/ads\.gaming1\.com/5
regex/adform\.net/5
regex/luzulabeguile\.com/5
regex/ficanportio\.biz/5
regex/hidelen\.com/5
regex/earchmess\.fun/5
regex/acrvclk\.com/5
regex/track\.wg-aff\.com/5
regex/thumb\.tapecontent\.net/5
regex/betgorebysson\.club/5
regex/in-page-push\.com/5
regex/itphanpytor\.club/5
regex/mktoresp\.com/5
regex/xid\.i-mobile\.co\.jp/5
regex/ads\.tremorhub\.com/5
So far, what I'm using is something like this:
for _, line := range file {
    data := strings.Split(line, "/")
    if data[0] == "regex" {
        match, _ := regexp.MatchString(data[1], website)
        if match {
            id, _ = strconv.Atoi(data[2])
        }
    }
}
This is working, but I wonder if there is a more optimized way to do it.
If the website matches a regex near the top, great; but if not, I have to iterate through the whole loop over and over until a match is found.
Can anyone help me improve this?
Best regards
To reduce the time, you can cache the compiled regexps.
package main

import (
    "bufio"
    "bytes"
    "fmt"
    csvutils "github.com/alessiosavi/GoGPUtils/csv"
    "log"
    "os"
    "regexp"
    "strconv"
    "strings"
    "time"
)

func main() {
    now := time.Now()
    Precomputed("www.google.it")
    fmt.Println(time.Since(now))
    now = time.Now()
    NonPrecomputed("www.google.it")
    fmt.Println(time.Since(now))
}

func NonPrecomputed(website string) int {
    for _, line := range cachedLines {
        data := strings.Split(line, "/")
        if data[0] == "regex" {
            match, _ := regexp.MatchString(data[1], website)
            if match {
                id, _ := strconv.Atoi(data[2])
                return id
            }
        }
    }
    return -1
}

func Precomputed(site string) int {
    for regex, id := range rawRegex {
        if ok := regex.MatchString(site); ok {
            return id
        }
    }
    return -1
}

var rawRegex = make(map[*regexp.Regexp]int)
var cachedLines []string
var sites []string

func init() {
    now := time.Now()
    file, err := os.ReadFile("regex.txt")
    if err != nil {
        panic(err)
    }
    scanner := bufio.NewScanner(bytes.NewReader(file))
    for scanner.Scan() {
        txt := scanner.Text()
        cachedLines = append(cachedLines, txt)
        split := strings.Split(txt, "/")
        if len(split) == 3 {
            compile, err := regexp.Compile(split[1])
            if err != nil {
                panic(err)
            }
            if rawRegex[compile], err = strconv.Atoi(split[2]); err != nil {
                panic(err)
            }
        }
    }
    file, err = os.ReadFile("top500Domains.csv")
    if err != nil {
        panic(err)
    }
    _, csvData, err := csvutils.ReadCSV(file, ',')
    if err != nil {
        panic(err)
    }
    for _, line := range csvData {
        sites = append(sites, line[1])
    }
    log.Println("Init took:", time.Since(now))
}
The init function takes care of the regexp cache. It loads all the regexps into a map, each with its relative ID (it also loads the test data, just for the benchmark).
Then you have 2 methods:
Precomputed: uses the map of cached regexps
NonPrecomputed: a copy-paste of your snippet
As you can see, where the NonPrecomputed method manages 63 executions, the Precomputed one manages 10000.
And the NonPrecomputed method allocates ~67 MB, while the Precomputed one makes no allocations at all (thanks to the initial cache).
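The benchmark functions themselves aren't shown in the answer; a minimal sketch of what they might look like (in a *_test.go file, iterating the package-level sites slice that init loads) that would produce output of the shape below:

import "testing"

func Benchmark_Precomputed(b *testing.B) {
    for i := 0; i < b.N; i++ {
        for _, site := range sites {
            Precomputed(site)
        }
    }
}

func Benchmark_NonPrecomputed(b *testing.B) {
    for i := 0; i < b.N; i++ {
        for _, site := range sites {
            NonPrecomputed(site)
        }
    }
}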
C:\opt\SP\Workspace\Go\Temp>go test -bench=. -benchmem -benchtime=10s
2022/11/03 00:45:35 Init took: 10.8397ms
goos: windows
goarch: amd64
pkg: Temp
cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz
Benchmark_Precomputed-8 10000 1113887 ns/op 0 B/op 0 allocs/op
Benchmark_NonPrecomputed-8 63 298434740 ns/op 65782238 B/op 484595 allocs/op
PASS
ok Temp 41.548s
I'm using the following code to get metadata from an S3 object after listing all the objects in a bucket. But I don't know why it gives the error undefined: s3.HeadObject when running go run listObjects.go -bucket xxxx -prefix xxxx.
I tried two solutions: passing the client created from the config, and passing it from the context as it appears in this link [1]. But both gave the same error. Can you give me any clue? Thanks in advance :)
package main

import (
    "context"
    "flag"
    "fmt"
    "log"

    "github.com/aws/aws-sdk-go-v2/config"
    "github.com/aws/aws-sdk-go-v2/service/s3"
)

var (
    bucketName      string
    objectPrefix    string
    objectDelimiter string
    maxKeys         int
)

func init() {
    flag.StringVar(&bucketName, "bucket", "", "The `name` of the S3 bucket to list objects from.")
    flag.StringVar(&objectPrefix, "prefix", "", "The optional `object prefix` of the S3 Object keys to list.")
    flag.StringVar(&objectDelimiter, "delimiter", "",
        "The optional `object key delimiter` used by S3 List objects to group object keys.")
    flag.IntVar(&maxKeys, "max-keys", 0,
        "The maximum number of `keys per page` to retrieve at once.")
}

// Lists all objects in a bucket using pagination
func main() {
    flag.Parse()
    if len(bucketName) == 0 {
        flag.PrintDefaults()
        log.Fatalf("invalid parameters, bucket name required")
    }

    // Load the SDK's configuration from environment and shared config, and
    // create the client with this.
    cfg, err := config.LoadDefaultConfig(context.TODO())
    if err != nil {
        log.Fatalf("failed to load SDK configuration, %v", err)
    }
    client := s3.NewFromConfig(cfg)

    // Set the parameters based on the CLI flag inputs.
    params := &s3.ListObjectsV2Input{
        Bucket: &bucketName,
    }
    if len(objectPrefix) != 0 {
        params.Prefix = &objectPrefix
    }
    if len(objectDelimiter) != 0 {
        params.Delimiter = &objectDelimiter
    }

    // Create the Paginator for the ListObjectsV2 operation.
    p := s3.NewListObjectsV2Paginator(client, params, func(o *s3.ListObjectsV2PaginatorOptions) {
        if v := int32(maxKeys); v != 0 {
            o.Limit = v
        }
    })

    // Iterate through the S3 object pages, printing each object returned.
    var i int
    log.Println("Objects:")
    for p.HasMorePages() {
        i++
        // NextPage takes a new context for each page retrieval. This is where
        // you could add timeouts or deadlines.
        page, err := p.NextPage(context.TODO())
        if err != nil {
            log.Fatalf("failed to get page %v, %v", i, err)
        }
        // Log the objects found
        // HeadObject function is called
        for _, obj := range page.Contents {
            input := &s3.HeadObjectInput{
                Bucket: &bucketName,
                Key:    obj.Key,
            }
            result, err := &s3.HeadObject(client, input)
            if err != nil {
                panic(err)
            }
            fmt.Println("Object:", *obj.Key)
        }
    }
}
./listObjects.go:86:20: undefined: s3.HeadObject
Doing the HeadObject call in an auxiliary function works. In the AWS SDK for Go v2, HeadObject is a method on s3.Client rather than a package-level function, which is why s3.HeadObject is undefined:
package main

import (
    "context"
    "flag"
    "fmt"
    "log"

    "github.com/aws/aws-sdk-go-v2/config"
    "github.com/aws/aws-sdk-go-v2/service/s3"
)

var (
    bucketName      string
    objectPrefix    string
    objectDelimiter string
    maxKeys         int
)

func init() {
    flag.StringVar(&bucketName, "bucket", "", "The `name` of the S3 bucket to list objects from.")
    flag.StringVar(&objectPrefix, "prefix", "", "The optional `object prefix` of the S3 Object keys to list.")
    flag.StringVar(&objectDelimiter, "delimiter", "",
        "The optional `object key delimiter` used by S3 List objects to group object keys.")
    flag.IntVar(&maxKeys, "max-keys", 0,
        "The maximum number of `keys per page` to retrieve at once.")
}

// Lists all objects in a bucket using pagination
func main() {
    flag.Parse()
    if len(bucketName) == 0 {
        flag.PrintDefaults()
        log.Fatalf("invalid parameters, bucket name required")
    }

    // Load the SDK's configuration from environment and shared config, and
    // create the client with this.
    cfg, err := config.LoadDefaultConfig(context.TODO())
    if err != nil {
        log.Fatalf("failed to load SDK configuration, %v", err)
    }
    client := s3.NewFromConfig(cfg)

    // Set the parameters based on the CLI flag inputs.
    params := &s3.ListObjectsV2Input{
        Bucket: &bucketName,
    }
    if len(objectPrefix) != 0 {
        params.Prefix = &objectPrefix
    }
    if len(objectDelimiter) != 0 {
        params.Delimiter = &objectDelimiter
    }

    // Create the Paginator for the ListObjectsV2 operation.
    p := s3.NewListObjectsV2Paginator(client, params, func(o *s3.ListObjectsV2PaginatorOptions) {
        if v := int32(maxKeys); v != 0 {
            o.Limit = v
        }
    })

    // Iterate through the S3 object pages, printing each object returned.
    var i int
    log.Println("Objects:")
    for p.HasMorePages() {
        i++
        // NextPage takes a new context for each page retrieval. This is where
        // you could add timeouts or deadlines.
        page, err := p.NextPage(context.TODO())
        if err != nil {
            log.Fatalf("failed to get page %v, %v", i, err)
        }
        // Log the objects found, calling HeadObject for each
        for _, obj := range page.Contents {
            fmt.Println("Object:", *obj.Key)
            OpHeadObject(client, bucketName, *obj.Key)
        }
    }
}

func OpHeadObject(sess *s3.Client, bucketName, objectName string) {
    input := &s3.HeadObjectInput{
        Bucket: &bucketName,
        Key:    &objectName,
    }
    resp, err := sess.HeadObject(context.TODO(), input)
    if err != nil {
        panic(err)
    }
    fmt.Println(resp.StorageClass) // or any other metadata field you want
}
I have the following struct:
type ProcessedRecords struct {
    CustIndividualID string `json:"individual id"`
    Household        string `json:"Household"`
}
And I have a slice of many structs of this type. I'm trying to submit them using the PutRecordBatch operation from the AWS SDK:
package main

import (
    "encoding/json"
    "fmt"
    "log"

    "github.com/aws/aws-sdk-go/aws"
    "github.com/aws/aws-sdk-go/aws/session"
    "github.com/aws/aws-sdk-go/service/firehose"
)

type ProcessedRecords struct {
    CustIndividualID string `json:"individual id"`
    Household        string `json:"Household"`
}

// recordList is assumed to be populated elsewhere
var recordList []ProcessedRecords

func main() {
    submitToFirehose(recordList)
}

func submitToFirehose(records []ProcessedRecords) {
    streamName := "processed-stream"
    sess := session.Must(session.NewSession())

    // Create a Firehose client with additional configuration
    firehoseService := firehose.New(sess, aws.NewConfig().WithRegion("us-east-1"))

    recordsBatchInput := &firehose.PutRecordBatchInput{}
    recordsBatchInput = recordsBatchInput.SetDeliveryStreamName(streamName)

    recordsInput := []*firehose.Record{}
    for i := 0; i < len(records); i++ {
        if len(recordsInput) == 500 {
            recordsBatchInput = recordsBatchInput.SetRecords(recordsInput)
            resp, err := firehoseService.PutRecordBatch(recordsBatchInput)
            if err != nil {
                fmt.Printf("PutRecordBatch err: %v\n", err)
            } else {
                fmt.Printf("FailedPuts: %v\n", *resp.FailedPutCount)
            }
            recordsInput = []*firehose.Record{}
        }
        b, err := json.Marshal(records[i])
        if err != nil {
            log.Printf("Error: %v", err)
        }
        record := &firehose.Record{Data: b}
        recordsInput = append(recordsInput, record)
    }
}
This seems to work, and my Glue backend appears to be set up correctly; however, CustIndividualID is not being written to S3. I suspect it's because Glue reads the json:"individual id" tag as the column name rather than CustIndividualID.
This is a problem because Glue table columns can't contain spaces. What am I doing wrong?
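If the space in the tag is indeed the problem, the simplest fix would be a Glue-safe JSON tag such as individual_id; a sketch, assuming you can rename the Glue column to match:

type ProcessedRecords struct {
    CustIndividualID string `json:"individual_id"` // underscore instead of space
    Household        string `json:"Household"`
}

The Go field name stays CustIndividualID; only the marshalled JSON key, and therefore the column name Glue sees, changes.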
Based on the example given in the link below on API Operation Pagination without Callbacks
https://aws.amazon.com/blogs/developer/context-pattern-added-to-the-aws-sdk-for-go/
I am trying to list all the backups in DynamoDB, but pagination does not seem to be working: it just retrieves the first page and never goes on to the next.
package main

import (
    "context"
    "fmt"

    "github.com/aws/aws-sdk-go/aws"
    "github.com/aws/aws-sdk-go/aws/request"
    "github.com/aws/aws-sdk-go/aws/session"
    "github.com/aws/aws-sdk-go/service/dynamodb"
)

func main() {
    sess, sessErr := session.NewSession()
    if sessErr != nil {
        fmt.Println(sessErr)
        fmt.Println("Could not initialize session..returning..")
        return
    }

    // Create DynamoDB client
    dynamodbSvc := dynamodb.New(sess)

    params := dynamodb.ListBackupsInput{}
    ctx := context.Background()
    p := request.Pagination{
        NewRequest: func() (*request.Request, error) {
            req, _ := dynamodbSvc.ListBackupsRequest(&params)
            req.SetContext(ctx)
            return req, nil
        },
    }

    for p.Next() {
        page := p.Page().(*dynamodb.ListBackupsOutput)
        fmt.Println("Received", len(page.BackupSummaries), "objects in page")
        for _, obj := range page.BackupSummaries {
            fmt.Println(aws.StringValue(obj.BackupName))
        }
    }
    //return p.Err()
} // end of main
It's a bit late, but I'll put it here in case it helps somebody.
Example:
var exclusiveStartARN *string
var backups []*dynamodb.BackupSummary
for {
    backup, err := svc.ListBackups(&dynamodb.ListBackupsInput{
        ExclusiveStartBackupArn: exclusiveStartARN,
    })
    if err != nil {
        fmt.Println(err)
        os.Exit(1)
    }
    backups = append(backups, backup.BackupSummaries...)
    if backup.LastEvaluatedBackupArn != nil {
        exclusiveStartARN = backup.LastEvaluatedBackupArn
        // max 5 times a second so we don't hit the limit
        time.Sleep(200 * time.Millisecond)
        continue
    }
    break
}
fmt.Println(len(backups))
Explanation:
Pagination is done via ExclusiveStartBackupArn in the ListBackups request. The response returns LastEvaluatedBackupArn if there are more pages, or nil if it's the last/only page.
It could also be that you're hammering the API a bit with your usage; per the docs:
You can call ListBackups a maximum of 5 times per second.
What is the value of p.HasNextPage() in your p.Next() loop?
I'm using the script from this post to download files from S3.
Everything works fine except that the downloaded files are all 0 B.
My script is basically the same as the script above.
I use ~/.aws/credentials to set my keys, and I set the region in the script.
Here is my script:
package main

import (
    "fmt"
    "os"
    "path/filepath"

    "github.com/aws/aws-sdk-go/aws"
    "github.com/aws/aws-sdk-go/service/s3"
    "github.com/aws/aws-sdk-go/service/s3/s3manager"
)

var (
    Bucket         = "logs"                               // Download from this bucket
    Prefix         = "local-dir/my_log/20150611/20150611" // Using this key prefix
    LocalDirectory = "s3logs"                             // Into this directory
)

func main() {
    manager := s3manager.NewDownloader(nil)
    d := downloader{bucket: Bucket, dir: LocalDirectory, Downloader: manager}
    client := s3.New(&aws.Config{Region: "ap-northeast-1"})
    params := &s3.ListObjectsInput{Bucket: &Bucket, Prefix: &Prefix}
    client.ListObjectsPages(params, d.eachPage)
}

type downloader struct {
    *s3manager.Downloader
    bucket, dir string
}

func (d *downloader) eachPage(page *s3.ListObjectsOutput, more bool) bool {
    for _, obj := range page.Contents {
        d.downloadToFile(*obj.Key)
    }
    return true
}

func (d *downloader) downloadToFile(key string) {
    // Create the directories in the path
    file := filepath.Join(d.dir, key)
    if err := os.MkdirAll(filepath.Dir(file), 0775); err != nil {
        panic(err)
    }
    fmt.Printf("Downloading " + key)

    // Setup the local file
    fd, err := os.Create(file)
    if err != nil {
        panic(err)
    }
    defer fd.Close()

    // Download the file using the AWS SDK
    fmt.Printf("Downloading s3://%s/%s to %s...\n", d.bucket, key, file)
    params := &s3.GetObjectInput{Bucket: &d.bucket, Key: &key}
    d.Download(fd, params)
}
The script lists the objects in the bucket just fine, but it does not download any files to my local file system, and it does not run into any exception.
Any idea why?
Thanks!
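One detail worth checking in the snippet above (my observation, not from the original post): downloadToFile discards both return values of d.Download, so any failure, such as a region or credentials problem, is silently swallowed and you are left with the empty file that os.Create just made. A minimal sketch of surfacing the error:

n, err := d.Download(fd, params)
if err != nil {
    panic(err) // or log and skip this key; either way the real failure becomes visible
}
fmt.Printf("Downloaded s3://%s/%s (%d bytes)\n", d.bucket, key, n)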