Firehose record format conversion partitions

I tried to use the new Firehose feature "record format conversion" to save my events as Parquet files for Athena or Hive aggregations. You have to select the table from your Glue catalog, but Firehose ignores the defined partitions and instead saves the files in the structure YYYY/MM/DD/HH/. The data is also missing the defined partition columns, which would be fine if they had actually been used for partitioning.
Is there an API configuration, or something else, that forces Firehose to use the table's partitioning?

I have exactly the same issue, even with the same partitioning.
So you have to use AWS Lambda to achieve what you want, with two functions:
one to move the files generated by Firehose into the bucket that Athena uses;
another to refresh the Athena table, as it will not see the new folders on its own
(I won't list all the triggers, but this should just be a call to 'MSCK REPAIR TABLE your_table_name;'; a rough sketch of this second function is included at the end of this answer).
For the first one I chose Node.js, as it is really simple and really fast:
roughly 3 seconds to move a 120 MB file with the minimum allowed 128 MB of RAM (files generated by Firehose will be approximately 64 MB at most).
Node.js project structure
package.json
{
  "name": "your.project",
  "version": "1.0.0",
  "description": "Copy generated partitioned files by Firehose to valid partitioned files for Athena",
  "main": "index.js",
  "dependencies": {
    "async": "^2.6.1"
  }
}
And index.js
const aws = require('aws-sdk');
const async = require('async');
const s3 = new aws.S3();
const dstBucket = 'PUT_YOUR_BUCKET_NAME_HERE';
var util = require('util');
exports.handler = (event, context, callback) => {
  const srcBucket = event.Records[0].s3.bucket.name;
  const srcKey = event.Records[0].s3.object.key;
  const split = srcKey.split('/');
  const dstKey = `event_year=${split[0]}/event_month=${split[1]}/event_day=${split[2]}/event_hour=${split[3]}/${split[4]}`;
  console.log("Reading options from event:\n", util.inspect(event, {depth: 10}));
  async.waterfall([
    function copy(next) {
      s3.copyObject({
        Bucket: dstBucket,
        CopySource: `${srcBucket}/${srcKey}`,
        Key: dstKey
      }, next);
    },
    function deleteOriginal(copyResult, next) {
      s3.deleteObject({
        Bucket: srcBucket,
        Key: srcKey
      }, next);
    }
  ], function (err) {
    if (err) {
      console.error(`Failed: ${srcBucket}/${srcKey} => ${dstBucket}/${dstKey} to move FireHose partitioned object to Athena partitioned object. Error: ${err}`);
    } else {
      console.log(`Success: ${srcBucket}/${srcKey} => ${dstBucket}/${dstKey} moved FireHose partitioned object to Athena partitioned object`);
    }
    callback(null, 'move success');
  });
};
Just update the placeholder data to match your case.
One more issue I ran into: when I built the project with
npm install
and zipped it, something was not right when AWS unzipped it, so I had to update the handler path to point to my index.js.
After that it works.
You will also notice this line:
console.log("Reading options from event:\n", util.inspect(event, {depth: 10}));
It can be removed, but it greatly helps in understanding the details of the object being processed.
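The answer above only includes the first function. For completeness, here is a rough sketch of what the second one could look like, using the Athena API from aws-sdk v2; the database name, table name and query results bucket are placeholders you would need to replace:
// Sketch of the second Lambda: refresh Athena's partition list after new folders appear.
// Database, table and results bucket below are placeholders, not values from the original answer.
const aws = require('aws-sdk');
const athena = new aws.Athena();
exports.handler = (event, context, callback) => {
  athena.startQueryExecution({
    QueryString: 'MSCK REPAIR TABLE your_table_name',
    QueryExecutionContext: { Database: 'your_glue_database' },
    ResultConfiguration: { OutputLocation: 's3://your-athena-query-results-bucket/' }
  }, (err, data) => {
    if (err) {
      console.error('Failed to start MSCK REPAIR TABLE:', err);
      return callback(err);
    }
    console.log('Started Athena query', data.QueryExecutionId);
    callback(null, 'repair started');
  });
};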

Related

S3 Create Objects Triggers in LAMBDA are not pulling in the S3 Data

THE ISSUE
I'm trying to load data from an S3 bucket using Lambda (Node.js). I've looked at various topics here where similar questions were posted and have tried to combine reading the data from many of them (here and here and a few more as well).
The setup is S3 (Create events) > SNS > SQS > Lambda. The invocation from S3 all the way to Lambda is working perfectly; the code and the CloudWatch logs below show a successful Lambda invocation for every PUT event in S3.
I have the IAM roles configured so that Lambda can read from SQS and S3.
I am NOT using SSE on the S3 buckets or objects.
But regardless of which approach I use (both are in the code below), the Lambda never writes the contents of the CSV file to the console; you can see that the final console logs, where the code tries to read the S3 data, are never emitted. Has something changed over the years that needs updating in the S3 API calls?
CODE
const readline = require('readline');
const AWS = require("aws-sdk");
const s3 = new AWS.S3({apiVersion: '2006-03-01'});
exports.handler = async (event) => {
  console.log('###### Received event:', JSON.stringify(event, null, 2));
  for (const record of event.Records) {
    try {
      const body = JSON.parse((record.body));
      // console.log(`###### Record BODY ${JSON.stringify(body, null, 2)}`);
      const message = JSON.parse(body.Message);
      //console.log(`##### Record BODY MESSAGE ${JSON.stringify(message, null, 2)}`);
      const bucket = message.Records[0].s3.bucket.name;
      const key = message.Records[0].s3.object.key;
      console.log(`###### Record BODY bucket ${bucket}`);
      console.log(`###### Record BODY key ${key}`);
      const params = {
        Bucket: bucket, Key: key
      };
      console.log(`###### THE PARAMS ${JSON.stringify(params, null, 2)}`);
      s3.getObject(params, function (err, data) {
        if (err) {
          console.log(`###### ERR ${err}`);
          throw err;
        } else {
          console.log(`###### ${data.Body.toString()}`);
        }
      })
      const rl = readline.createInterface({
        input: s3.getObject(params).createReadStream()
      });
      rl.on('line', function(line) {
        console.log(`####### RECORD DATA LINE ${line}`);
      })
      .on('close', function() {
        console.log(`####### RECORD DATA CLOSED!!!!!`);
      });
    } catch (e) {
      console.log(`###### Error ${JSON.stringify(e, null, 2)}`);
    }
  }
  return `Successfully processed ${event.Records.length} messages.`;
};
IAM ROLES
For the sake of brevity I've removed the SQS-specific statements, which work fine. I've actually granted all S3 actions, but I'm only listing s3:GetObject here, again for brevity.
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "VisualEditor0",
      "Effect": "Allow",
      "Action": [
        ..
        "s3:GetObject",
        ..
      ],
      "Resource": "arn:aws:s3:::my-REDACTED-bucket"
    }
  ]
}
CLOUDWATCH LOGS
2022-08-31T12:00:07.897Z 0191e48d-f3ac-5af8-9334-bf04bcd815d8 INFO ###### Record BODY bucket my-REDACTED-bucket
2022-08-31T12:00:07.897Z 0191e48d-f3ac-5af8-9334-bf04bcd815d8 INFO ###### Record BODY key test.csv
2022-08-31T12:00:07.897Z 0191e48d-f3ac-5af8-9334-bf04bcd815d8 INFO THE PARAMS
{
"Bucket": "my-REDACTED-bucket",
"Key": "test.csv"
}
END RequestId: 0191e48d-f3ac-5af8-9334-bf04bcd815d8
The basic problem is in your async code. Your code is exiting before anything happens. You need to modify the code to await the asynchronous results.
Use:
const data = await s3.getObject(params).promise();
If you prefer to use the streamed, readline approach then see this answer.
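For reference, here is a minimal sketch of the question's handler restructured around await, per the advice above (both the buffered and the streamed variant are shown; in practice you would pick one):
const readline = require('readline');
const AWS = require('aws-sdk');
const s3 = new AWS.S3({ apiVersion: '2006-03-01' });
// Sketch only: awaiting the asynchronous results keeps the handler alive until the object is read.
exports.handler = async (event) => {
  for (const record of event.Records) {
    const body = JSON.parse(record.body);
    const message = JSON.parse(body.Message);
    const params = {
      Bucket: message.Records[0].s3.bucket.name,
      Key: message.Records[0].s3.object.key
    };
    // Buffered read: await the promise so the handler does not return early.
    const data = await s3.getObject(params).promise();
    console.log(data.Body.toString());
    // Streamed alternative: wrap the readline events in a promise and await it.
    await new Promise((resolve, reject) => {
      const stream = s3.getObject(params).createReadStream();
      stream.on('error', reject);
      readline.createInterface({ input: stream })
        .on('line', (line) => console.log(line))
        .on('close', resolve);
    });
  }
  return `Successfully processed ${event.Records.length} messages.`;
};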

How to return an entire Datastore table by name using Node.js on a Google Cloud Function

I want to retrieve a table (with all rows) by name. I want to make an HTTP request with something like this in the body: {"table": user}.
Tried this code without success:
'use strict';
const {Datastore} = require('@google-cloud/datastore');
// Instantiates a client
const datastore = new Datastore();
exports.getUsers = (req, res) => {
  //Get List
  const query = this.datastore.createQuery('users');
  this.datastore.runQuery(query).then(results => {
    const customers = results[0];
    console.log('User:');
    customers.forEach(customer => {
      const cusKey = customer[this.datastore.KEY];
      console.log(cusKey.id);
      console.log(customer);
    });
  })
  .catch(err => { console.error('ERROR:', err); });
}
Google Datastore is a NoSQL database that works with entities, not tables. What you want is to load all the "records", which are entities identified by keys in Datastore, together with their "properties", which are the "columns" you see in the Console, and to load them based on the "kind" name, which is the "table" you are referring to.
Here is how to retrieve all the keys and their properties from Datastore, using an HTTP-triggered Cloud Function running in the Node.js 8 environment.
Create a Google Cloud Function and set the trigger to HTTP.
Choose Node.js 8 as the runtime.
In index.js, replace all the code with this GitHub code.
In package.json add:
{
  "name": "sample-http",
  "version": "0.0.1",
  "dependencies": {
    "@google-cloud/datastore": "^3.1.2"
  }
}
Under "Function to execute", enter loadDataFromDatastore, since this is the name of the function that we want to execute.
NOTE: This will log all the loaded records to the Cloud Function's Stackdriver logs. Each record in the response is JSON, so you will have to parse the response into a JSON object to get the data you want. Use this as a starting point and modify the code accordingly.
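The linked GitHub code is not reproduced here, but a minimal sketch of such a loadDataFromDatastore function might look like the following (the kind name is taken from the request body, as in the question; the response shape and error handling are assumptions):
'use strict';
const { Datastore } = require('@google-cloud/datastore');
const datastore = new Datastore();
// Sketch only: load every entity of the kind named in the request body,
// e.g. POST {"table": "users"}, and return the records as JSON.
exports.loadDataFromDatastore = async (req, res) => {
  try {
    const kind = req.body.table;
    const query = datastore.createQuery(kind);
    const [entities] = await datastore.runQuery(query);
    // Attach each entity's key id/name so callers can identify the records.
    const records = entities.map(entity => Object.assign(
      { key: entity[datastore.KEY].id || entity[datastore.KEY].name },
      entity
    ));
    console.log(`Loaded ${records.length} entities of kind ${kind}`);
    res.status(200).json(records);
  } catch (err) {
    console.error('ERROR:', err);
    res.status(500).send(err.message);
  }
};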

How to invoke AWS CLI command from CodePipeline?

I want to copy artifacts from an S3 bucket in Account 1 to an S3 bucket in Account 2. I was able to set up replication, but I want to know whether there is a way to invoke an AWS CLI command from within a pipeline.
Can it be invoked using a Lambda function? If yes, a small sample script would be helpful.
Yes, you can add a Lambda invoke action to your pipeline and have the function call the CopyObject API. The core part of the Lambda function is as follows.
// The aws-sdk require and the list of destination buckets are not shown in the original
// snippet; they are declared here with placeholder values so the function is self-contained.
const AWS = require('aws-sdk')
const s3 = new AWS.S3()
const prodBuckets = ['your-prod-bucket-1', 'your-prod-bucket-2']
exports.copyRepoToProdS3 = (event, context) => {
  const jobId = event['CodePipeline.job'].id
  const s3Location = event['CodePipeline.job'].data.inputArtifacts[0].location.s3Location
  const cpParams = JSON.parse(event['CodePipeline.job'].data.actionConfiguration.configuration.UserParameters)
  let promises = []
  for (let bucket of prodBuckets) {
    let params = {
      Bucket: bucket,
      CopySource: s3Location['bucketName'] + '/' + s3Location['objectKey'],
      Key: cpParams['S3ObjectKey']
    }
    promises.push(s3.copyObject(params).promise())
  }
  return Promise.all(promises)
    .then((data) => {
      console.log('Successfully copied repo to buckets!')
    }).catch((error) => {
      console.log('Failed to copy repo to buckets!', error)
    })
}
More detailed steps for adding the roles and reporting the processing result back to CodePipeline can be found at the following link: https://medium.com/@codershunshun/how-to-invoke-aws-lambda-in-codepipeline-d7c77457af95
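The key missing piece in the snippet above is reporting the result back to CodePipeline; otherwise the action hangs until it times out. A rough sketch with aws-sdk v2 (the helper name is illustrative, not from the original answer):
// Sketch only: report the Lambda action's outcome back to CodePipeline.
// Reuses the aws-sdk require from the snippet above.
const codepipeline = new AWS.CodePipeline()
function reportJobResult(jobId, error, context) {
  if (!error) {
    return codepipeline.putJobSuccessResult({ jobId }).promise()
  }
  return codepipeline.putJobFailureResult({
    jobId,
    failureDetails: {
      message: JSON.stringify(error.message || error),
      type: 'JobFailed',
      externalExecutionId: context.awsRequestId
    }
  }).promise()
}
You would call reportJobResult(jobId, null, context) in the .then branch and reportJobResult(jobId, error, context) in the .catch branch of the promise chain above.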

AWS S3 copy to bucket from remote location

There is a large dataset on a public server (~0.5 TB, multi-part here), which I would like to copy into my own S3 buckets. It seems like aws s3 cp only works with local files or files already in S3 buckets?
How can I copy that file (either single or multi-part) into S3? Can I use the AWS CLI, or do I need something else?
There's no way to upload it directly to S3 from the remote location. But you can stream the contents of the remote files to your machine and then up to S3. This means that you will have downloaded the entire 0.5 TB of data, but your computer will only ever hold a tiny fraction of it in memory at a time (it will not be persisted to disk either). Here is a simple implementation in JavaScript:
const request = require('request')
const async = require('async')
const AWS = require('aws-sdk')
const s3 = new AWS.S3()
const Bucket = 'nyu_depth_v2'
const baseUrl = 'http://horatio.cs.nyu.edu/mit/silberman/nyu_depth_v2/'
const parallelLimit = 5
const parts = [
'basements.zip',
'bathrooms_part1.zip',
'bathrooms_part2.zip',
'bathrooms_part3.zip',
'bathrooms_part4.zip',
'bedrooms_part1.zip',
'bedrooms_part2.zip',
'bedrooms_part3.zip',
'bedrooms_part4.zip',
'bedrooms_part5.zip',
'bedrooms_part6.zip',
'bedrooms_part7.zip',
'bookstore_part1.zip',
'bookstore_part2.zip',
'bookstore_part3.zip',
'cafe.zip',
'classrooms.zip',
'dining_rooms_part1.zip',
'dining_rooms_part2.zip',
'furniture_stores.zip',
'home_offices.zip',
'kitchens_part1.zip',
'kitchens_part2.zip',
'kitchens_part3.zip',
'libraries.zip',
'living_rooms_part1.zip',
'living_rooms_part2.zip',
'living_rooms_part3.zip',
'living_rooms_part4.zip',
'misc_part1.zip',
'misc_part2.zip',
'office_kitchens.zip',
'offices_part1.zip',
'offices_part2.zip',
'playrooms.zip',
'reception_rooms.zip',
'studies.zip',
'study_rooms.zip'
]
async.eachLimit(parts, parallelLimit, (Key, cb) => {
  s3.upload({
    Key,
    Bucket,
    Body: request(baseUrl + Key)
  }, cb)
}, (err) => {
  if (err) console.error(err)
  else console.log('Done')
})

Copying one table to another in DynamoDB

What's the best way to identically copy one table over to a new one in DynamoDB?
(I'm not worried about atomicity).
Create a backup (via the Backups option) and restore it with a new table name. That will get all the data into the new table.
Note: this can take a considerable amount of time, depending on the table size.
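If you prefer to script the backup-and-restore flow instead of clicking through the console, the same operations are available through the SDK. A rough sketch with aws-sdk v2 (the polling loop and names are illustrative, not part of the original answer):
const AWS = require('aws-sdk');
const dynamodb = new AWS.DynamoDB();
// Sketch only: back up the source table, wait for the backup, restore it under a new name.
async function copyViaBackup(sourceTable, targetTable) {
  const { BackupDetails } = await dynamodb.createBackup({
    TableName: sourceTable,
    BackupName: `${sourceTable}-copy-${Date.now()}`
  }).promise();
  // Poll until the backup is AVAILABLE (simple polling; tune the interval as needed).
  let status = BackupDetails.BackupStatus;
  while (status !== 'AVAILABLE') {
    await new Promise(resolve => setTimeout(resolve, 5000));
    const { BackupDescription } = await dynamodb.describeBackup({
      BackupArn: BackupDetails.BackupArn
    }).promise();
    status = BackupDescription.BackupDetails.BackupStatus;
  }
  await dynamodb.restoreTableFromBackup({
    TargetTableName: targetTable,
    BackupArn: BackupDetails.BackupArn
  }).promise();
}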
I just used the python script, dynamodb-copy-table, making sure my credentials were in some environment variables (AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY), and it worked flawlessly. It even created the destination table for me.
python dynamodb-copy-table.py src_table dst_table
The default region is us-west-2, change it with the AWS_DEFAULT_REGION env variable.
AWS Data Pipeline provides a template which can be used for this purpose: "CrossRegion DynamoDB Copy"
See: http://docs.aws.amazon.com/datapipeline/latest/DeveloperGuide/dp-crossregion-ddb-create.html
The result is a simple pipeline.
Although it's called CrossRegion, you can easily use it within a single region as long as the destination table name is different (remember that table names are unique per account and region).
You can use Scan to read the data and save it to the new table.
On the AWS forums a guy from the AWS team posted another approach using EMR: How Do I Duplicate a Table?
Here's one solution to copy all items from one table to another, using just shell scripting, the AWS CLI and jq. It will work OK for smallish tables; note that batch-write-item accepts at most 25 items per request, so larger tables would need chunking.
# exit on error
set -eo pipefail
# tables
TABLE_FROM=<table>
TABLE_TO=<table>
# read
aws dynamodb scan \
--table-name "$TABLE_FROM" \
--output json \
| jq "{ \"$TABLE_TO\": [ .Items[] | { PutRequest: { Item: . } } ] }" \
> "$TABLE_TO-payload.json"
# write
aws dynamodb batch-write-item --request-items file://"$TABLE_TO-payload.json"
# clean up
rm "$TABLE_TO-payload.json"
If you want both tables to be identical, you'd want to delete all items in TABLE_TO first.
DynamoDB now supports importing from S3.
https://aws.amazon.com/blogs/database/amazon-dynamodb-can-now-import-amazon-s3-data-into-a-new-table/
So, in almost all use cases, the easiest and cheapest way to replicate a table is:
Use the "Export to S3" feature to dump the entire table into S3. Since this uses backups to generate the dump, the table's throughput is not affected, and it is very fast as well. You need to have backups (PITR) enabled. See https://aws.amazon.com/blogs/aws/new-export-amazon-dynamodb-table-data-to-data-lake-amazon-s3/
Use "Import from S3" to import the dump created in step 1. The import always creates a new table as part of the process. A rough SDK sketch of both steps follows.
Use this Node.js module: copy-dynamodb-table
This is a little script I made to copy the contents of one table to another.
It's based on AWS SDK v3. I'm not sure how well it would scale to big tables, but as a quick and dirty solution it does the job.
It gets your AWS credentials from a profile in ~/.aws/credentials; change default to the name of the profile you want to use.
Other than that, it takes two arguments: one for the source table and one for the destination.
const { fromIni } = require("@aws-sdk/credential-providers");
const { DynamoDBClient, ScanCommand, PutItemCommand } = require("@aws-sdk/client-dynamodb");
const ddbClient = new DynamoDBClient({
  credentials: fromIni({profile: "default"}),
  region: "eu-west-1",
});
const args = process.argv.slice(2);
console.log(args)
async function main() {
  const { Items } = await ddbClient.send(
    new ScanCommand({
      TableName: args[0],
    })
  );
  console.log("Successfully scanned table")
  console.log("Copying", Items.length, "Items")
  const putPromises = [];
  Items.forEach((item) => {
    putPromises.push(
      ddbClient.send(
        new PutItemCommand({
          TableName: args[1],
          Item: item,
        })
      )
    );
  });
  await Promise.all(putPromises);
  console.log("Successfully copied table")
}
main();
Usage
node copy-table.js <source_table_name> <destination_table_name>
Python + boto3 🚀
The script is idempotent as long as you keep the same keys.
import boto3

def migrate(source, target):
    dynamo_client = boto3.client('dynamodb', region_name='us-east-1')
    dynamo_target_client = boto3.client('dynamodb', region_name='us-west-2')
    dynamo_paginator = dynamo_client.get_paginator('scan')
    dynamo_response = dynamo_paginator.paginate(
        TableName=source,
        Select='ALL_ATTRIBUTES',
        ReturnConsumedCapacity='NONE',
        ConsistentRead=True
    )
    for page in dynamo_response:
        for item in page['Items']:
            dynamo_target_client.put_item(
                TableName=target,
                Item=item
            )

if __name__ == '__main__':
    migrate('awesome-v1', 'awesome-v2')
On November 29th, 2017 Global Tables was introduced. This may be useful depending on your use case, which may not be the same as the original question. Here are a few snippets from the blog post:
Global Tables – You can now create tables that are automatically replicated across two or more AWS Regions, with full support for multi-master writes, with a couple of clicks. This gives you the ability to build fast, massively scaled applications for a global user base without having to manage the replication process.
...
You do not need to make any changes to your existing code. You simply send write requests and eventually consistent read requests to a DynamoDB endpoint in any of the designated Regions (writes that are associated with strongly consistent reads should share a common endpoint). Behind the scenes, DynamoDB implements multi-master writes and ensures that the last write to a particular item prevails. When you use Global Tables, each item will include a timestamp attribute representing the time of the most recent write. Updates are propagated to other Regions asynchronously via DynamoDB Streams and are typically complete within one second (you can track this using the new ReplicationLatency and PendingReplicationCount metrics).
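For reference, the 2017 version of the feature links identically defined tables (same name and keys, with streams enabled) that already exist in each Region. A rough sketch with aws-sdk v2 (table name and Regions are placeholders):
const AWS = require('aws-sdk');
const dynamodb = new AWS.DynamoDB({ region: 'us-east-1' });
// Sketch only: the table must already exist, with streams enabled, in every listed Region.
dynamodb.createGlobalTable({
  GlobalTableName: 'your-table-name',
  ReplicationGroup: [
    { RegionName: 'us-east-1' },
    { RegionName: 'us-west-2' }
  ]
}, (err, data) => {
  if (err) console.error(err);
  else console.log('Global table created:', JSON.stringify(data.GlobalTableDescription));
});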
Another option is to download the table as a .csv file and upload it with the following snippet of code.
This also eliminates the need to provide your AWS credentials to a package such as the one @ezzat suggests.
Create a new folder and add the following two files and your exported table
Edit uploadToDynamoDB.js and add the filename of the exported table and your table name
Run npm install in the folder
Run node uploadToDynamoDB.js
File: package.json
{
  "name": "uploadtodynamodb",
  "version": "1.0.0",
  "description": "",
  "main": "uploadToDynamoDB.js",
  "author": "",
  "license": "ISC",
  "dependencies": {
    "async": "^3.1.1",
    "aws-sdk": "^2.624.0",
    "csv-parse": "^4.8.5",
    "fs": "0.0.1-security",
    "lodash": "^4.17.15",
    "uuid": "^3.4.0"
  }
}
File: uploadToDynamoDB.js
var fs = require('fs');
var parse = require('csv-parse');
var async = require('async');
var _ = require('lodash')
var AWS = require('aws-sdk');
// If your table is in another region, make sure to update this
AWS.config.update({ region: "eu-central-1" });
var ddb = new AWS.DynamoDB({ apiVersion: '2012-08-10' });
var csv_filename = "./TABLE_CSV_EXPORT_FILENAME.csv";
var tableName = "TABLENAME"
function prepareData(data_chunk) {
  const items = data_chunk.map(obj => {
    const keys = Object.keys(obj)
    let attr = Object.values(obj)
    attr = attr.map(a => {
      let newAttr;
      // Can we make this an integer
      if (isNaN(Number(a))) {
        newAttr = { "S": a }
      } else {
        newAttr = { "N": a }
      }
      return newAttr
    })
    let item = _.zipObject(keys, attr)
    return {
      PutRequest: {
        Item: item
      }
    }
  })
  var params = {
    RequestItems: {
      [tableName]: items
    }
  };
  return params
}
rs = fs.createReadStream(csv_filename);
parser = parse({
  columns: true,
  delimiter: ','
}, function(err, data) {
  var split_arrays = [], size = 25;
  while (data.length > 0) {
    split_arrays.push(data.splice(0, size));
  }
  data_imported = false;
  chunk_no = 1;
  async.each(split_arrays, function(item_data, callback) {
    const params = prepareData(item_data)
    ddb.batchWriteItem(params, function (err, data) {
      if (err) {
        console.log("Error", err);
      } else {
        console.log("Success", data);
      }
      // Signal async.each that this chunk is done (missing in the original snippet;
      // without it the final "all data imported" callback never fires).
      callback();
    });
  }, function() {
    // run after loops
    console.log('all data imported....');
  });
});
rs.pipe(parser);
It's been a very long time since the question was posted, and AWS has kept adding features since then. At the time of writing this answer, you can export the table to an S3 bucket and then use the import feature to load that data from S3 into a new table, which is created automatically as part of the import. Please refer to this blog for more details on export and import.
The best part is that you get to change the table name, PK or SK.
Note: you have to enable PITR (which might incur additional costs). It's always best to refer to the documentation.
Here is another simple Python utility script for this: ddb_table_copy.py. I use it often.
usage: ddb_table_copy.py [-h] [--dest-table DEST_TABLE] [--dest-file DEST_FILE] source_table
Copy all DynamoDB items from SOURCE_TABLE to either DEST_TABLE or DEST_FILE. Useful for migrating data during a stack teardown/re-creation.
positional arguments:
  source_table          Name of source table in DynamoDB.
optional arguments:
  -h, --help            show this help message and exit
  --dest-table DEST_TABLE
                        Name of destination table in DynamoDB.
  --dest-file DEST_FILE
                        A valid file path to save the items to, e.g. 'items.json'.