How Do I Make a Faster Riak MapReduce Query? - mapreduce

How can we make our MapReduce Queries Faster?
We have built an application using a five node Riak DB cluster.
Our data model is composed of three buckets: matches, leagues, and teams.
Matches contains links to leagues and teams:
Model
var match = {
id: matchId,
leagueId: meta.leagueId,
homeTeamId: meta.homeTeamId,
awayTeamId: meta.awayTeamId,
startTime: m.match.startTime,
firstHalfStartTime: m.match.firstHalfStartTime,
secondHalfStartTime: m.match.secondHalfStartTime,
score: {
goals: {
a: 1*safeGet(m.match, 'score.goals.a'),
b: 1*safeGet(m.match, 'score.goals.b')
},
corners: {
a: 1*safeGet(m.match, 'score.corners.a'),
b: 1*safeGet(m.match, 'score.corners.b')
}
}
};
var options = {
index: {
leagueId: match.leagueId,
teamId: [match.homeTeamId, match.awayTeamId],
startTime: match.startTime || match.firstHalfStartTime || match.secondHalfStartTime
},
links: [
{ bucket: 'leagues', key: match.leagueId, tag: 'league' },
{ bucket: 'teams', key: match.homeTeamId, tag: 'home' },
{ bucket: 'teams', key: match.awayTeamId, tag: 'away' }
]
};
match.model = 'match';
modelCache.save('matches', match.id, match, options, callback);
Queries
To write a query that returns results from several buckets, one option is to query each bucket separately; the other is to use links so a single query can combine results from multiple buckets.
Both versions of the query we tried take over a second, no matter how small our buckets are.
The first version uses two map phases, which we modeled after this post (Practical Map-Reduce: Forwarding and Collecting).
#!/bin/bash
curl -X POST \
-H "content-type: application/json" \
-d @- \
http://localhost:8091/mapred \
<<EOF
{
"inputs":{
"bucket":"matches",
"index":"startTime_bin",
"start":"2012-10-22T23:00:00",
"end":"2012-10-24T23:35:00"
},
"query": [
{"map":{"language": "javascript", "source":"
function(value, keydata, arg){
var match = Riak.mapValuesJson(value)[0];
var links = value.values[0].metadata.Links;
var result = links.map(function(l) {
return [l[0], l[1], match];
});
return result;
}
"}
},
{"map":{"language": "javascript", "source": "
function(value, keydata, arg) {
var doc = Riak.mapValuesJson(value)[0];
return [doc, keydata];
}
"}
},
{"reduce":{
"language": "javascript",
"source":"
function(values) {
var merged = {};
values.forEach(function(v) {
if(!merged[v.id]) {
merged[v.id] = v;
}
});
var results = [];
for(key in merged) {
results.push(merged[key]);
}
return results;
}
"
}
}
]
}
EOF
In the second version we do four separate Map-Reduce queries to get the objects from the three buckets:
async.series([
//First get all matches
function(callback) {
db.mapreduce
.add(inputs)
.map(function (val, key, arg) {
var data = Riak.mapValuesJson(val)[0];
if(arg.leagueId && arg.leagueId != data.leagueId) {
return [];
}
var d = new Date();
var date = data.startTime || data.firstHalfStartTime || data.secondHalfStartTime;
d.setFullYear(date.substring(0, 4));
d.setMonth(date.substring(5, 7) - 1);
d.setDate(date.substring(8, 10));
d.setHours(date.substring(11, 13));
d.setMinutes(date.substring(14, 16));
d.setSeconds(date.substring(17, 19));
d.setMilliseconds(0);
startTimestamp = d.getTime();
var short = {
id: data.id,
l: data.leagueId,
h: data.homeTeamId,
a: data.awayTeamId,
t: startTimestamp,
s: data.score,
c: startTimestamp
};
return [short];
}, {leagueId: query.leagueId, page: query.page}).reduce(function (val, key) {
return val;
}).run(function (err, matches) {
matches.forEach(function(match) {
result.match[match.id] = match; //Should maybe filter this
leagueIds.push(match.l);
teamIds.push(match.h);
teamIds.push(match.a);
});
callback();
});
},
//Then get all leagues, teams and lines in parallel
function(callback) {
async.parallel([
//Leagues
function(callback) {
db.getMany('leagues', leagueIds, function(err, leagues) {
if (err) { callback(err); return; }
leagues.forEach(function(league) {
visibleLeagueIds[league.id] = true;
result.league[league.id] = {
r: league.regionId,
n: league.name,
s: league.name
};
});
callback();
});
},
//Teams
function(callback) {
db.getMany('teams', teamIds, function(err, teams) {
if (err) { callback(err); return; }
teams.forEach(function(team) {
result.team[team.id] = {
n: team.name,
h: team.name,
s: team.stats
};
});
callback();
});
}
], callback);
}
], function(err) {
if (err) { callback(err); return; }
_.each(regionModel.getAll(), function(region) {
result.region[region.id] = {
id: region.id,
c: 'https://d1goqbu19rcwi8.cloudfront.net/icons/silk-flags/' + region.icon + '.png',
n: region.name
};
});
var response = {
success: true,
result: {
modelRecords: result,
paging: {
page: query.page,
pageSize: 50,
total: result.match.length
},
time: moment().diff(a)/1000.00,
visibleLeagueIds: visibleLeagueIds
}
};
callback(null, JSON.stringify(response, null, '\t'));
});
How do we make these queries faster?
Additional info:
We are using riak-js and node.js to run our queries.

One way to make it at least a bit faster would be to deploy the JavaScript mapreduce functions to the server instead of passing them through as part of the job (see the description of the js_source_dir parameter here). This is usually recommended if you have JavaScript functions that you run repeatedly.
As there is some overhead associated with running JavaScript mapreduce functions compared to native ones implemented in Erlang, using non-JavaScript functions where possible may also help.
The two map phase functions in your first query appear to be designed to work around the limitation that a normal linking phase (which I believe is more efficient) does not pass on the record being processed (the matches record). The first function includes all the links and passes on the match data as additional data in JSON form, while the second passes on the data of the match as well as the linked record in JSON form.
I have written a simple Erlang function that includes all links as well as the ID of the record passed in. This could be used together with the native Erlang function riak_kv_mapreduce:map_object_value to replace the two map phase functions in your first example, removing some of the JavaScript usage. As in the existing solution, I would expect you to receive a number of duplicates as several matches may link to the same league/team.
-module(riak_mapreduce_example).
-export([map_link/3]).
%% @spec map_link(riak_object:riak_object(), term(), term()) ->
%%      [{{Bucket :: binary(), Key :: binary()}, Props :: term()}]
%% @doc map phase function for adding linked records to result set
map_link({error, notfound}, _, _) ->
[];
map_link(RiakObject, Props, _) ->
Bucket = riak_object:bucket(RiakObject),
Key = riak_object:key(RiakObject),
Meta = riak_object:get_metadata(RiakObject),
Current = [{{Bucket, Key}, Props}],
Links = case dict:find(<<"Links">>, Meta) of
{ok, List} ->
[{{B, K}, Props} || {{B, K}, _Tag} <- List];
error ->
[]
end,
lists:append([Current, Links]).
The results of these can either be sent back to the client for aggregation or passed into a reduce phase function as in the example you provided.
The example function would need to be compiled and installed on all nodes, and may require a restart.
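For illustration, the "query" section of the /mapred request could then reference the Erlang functions instead of the two JavaScript map phases. The phase specification below is a sketch only; it assumes riak_mapreduce_example has been compiled and added to the code path on every node, and the reduce phase can remain as in your example.
// Sketch of the "query" phase list, expressed as a JavaScript object literal.
// The first phase emits each match plus the {bucket, key} pairs of its links;
// the second fetches the linked objects and returns their values, with no JavaScript involved.
var query = [
  { map: { language: "erlang", module: "riak_mapreduce_example", "function": "map_link" } },
  { map: { language: "erlang", module: "riak_kv_mapreduce", "function": "map_object_value" } }
];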
Another way to improve performance (which very well may not be an option for you) would be to alter the data model so that mapreduce queries are not needed for your performance-critical queries at all.

Related

The scan function in DynamoDB doesn't return some of the results

I have an AWS Lambda function that lists every patient in a DynamoDB table. I realized that some items from the table were not in the list. This is my listing function:
module.exports.listPatients = async (event) => {
try {
const queryString = {
limit: 5,
...event.queryStringParameters,
};
const { limit, next, name } = queryString;
const localParams = {
...patientsParams,
Limit: limit,
FilterExpression: "contains(full_name, :full_name)",
ExpressionAttributeValues: { ":full_name": name },
};
if (next) {
localParams.ExclusiveStartKey = {
id: next,
};
}
const data = await dynamoDb.scan(localParams).promise();
const nextToken = data.LastEvaluatedKey ? data.LastEvaluatedKey.id : "";
const result = {
items: data.Items,
next_token: nextToken,
};
return {
statusCode: 200,
body: JSON.stringify(result),
};
} catch (error) {
console.log("Error: ", error);
return {
statusCode: error.statusCode ? error.statusCode : 500,
body: JSON.stringify({
error: error.name ? error.name : "Exception",
message: error.message ? error.message : "Unknown error",
}),
};
}
};
Am I missing something?
I tried with and without a limit, and removed the filters, but still nothing.
I tested one of the missing ids with get() to check whether the server could find it, and it worked.
I am using Serverless to deploy the code, and when I test offline it works.
Stack Overflow recommended this post while I was writing my question, but I am using DynamoDB.DocumentClient without specifying the full attribute type in the filter expression:
How to scan in DynamoDB without primary sort key with Nodejs
Looks like you are paginating using scan(). Using query() with some Global Secondary Indexes and ScanIndexForward would give you much better performance; scan() doesn't scale well as your data grows.
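As a rough sketch (the table and index names here are assumptions, since your table definition is not shown), a query against such a GSI with DocumentClient could look like this:
// Hypothetical example: assumes a GSI named "full_name-index" whose partition key is full_name.
// query() only reads items matching the key condition, unlike scan(), which reads every item
// and applies the filter afterwards.
const AWS = require("aws-sdk");
const dynamoDb = new AWS.DynamoDB.DocumentClient();

const params = {
  TableName: "patients",                 // assumed table name
  IndexName: "full_name-index",          // assumed GSI name
  KeyConditionExpression: "full_name = :full_name",
  ExpressionAttributeValues: { ":full_name": "John Doe" },
  ScanIndexForward: true,                // ascending order on the sort key, if the index has one
  Limit: 5,
};

dynamoDb.query(params).promise()
  .then((data) => console.log(data.Items, data.LastEvaluatedKey));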

DynamoDB JavaScript PutItemCommand is neither failing nor working

Please note: although this question mentions AWS SAM, it is 100% a DynamoDB JavaScript SDK question at heart and can be answered by anyone with experience writing JavaScript Lambdas (or any client-side apps) against DynamoDB using the AWS DynamoDB client/SDK.
So I used AWS SAM to provision a new DynamoDB table with the following attributes:
FeedbackDynamoDB:
  Type: AWS::DynamoDB::Table
  Properties:
    TableName: commentary
    AttributeDefinitions:
      - AttributeName: id
        AttributeType: S
    KeySchema:
      - AttributeName: id
        KeyType: HASH
    ProvisionedThroughput:
      ReadCapacityUnits: 5
      WriteCapacityUnits: 5
    StreamSpecification:
      StreamViewType: NEW_IMAGE
This configuration successfully creates a DynamoDB table called commentary. However, when I view this table in the DynamoDB web console, I notice a few things:
it has a partition key of id (type S)
it has no sort key
it has no (0) indexes
it has a read/write capacity mode of "5"
I'm not sure if this raises any red flags with anyone but I figured I would include those details, in case I've configured anything incorrectly.
Now then, I have a JavaScript (TypeScript) Lambda that instantiates a DynamoDB client (using the JavaScript SDK) and attempts to add a record/item to this table:
// this code is in a file named app.ts:
import { APIGatewayProxyEvent, APIGatewayProxyResult } from 'aws-lambda';
import { User, allUsers } from './users';
import { Commentary } from './commentary';
import { PutItemCommand } from "@aws-sdk/client-dynamodb";
import { DynamoDBClient } from "@aws-sdk/client-dynamodb";
export const lambdaHandler = async (event: APIGatewayProxyEvent): Promise<APIGatewayProxyResult> => {
try {
const ddbClient = new DynamoDBClient({ region: "us-east-1" });
let status: number = 200;
let responseBody: string = "\"message\": \"hello world\"";
const { id, content, createdAt, providerId, receiverId } = JSON.parse(event.body);
const commentary = new Commentary(id, content, createdAt, providerId, receiverId);
console.log("deserialized this into commentary");
console.log("and the deserialized commentary has content of: " + commentary.getContent());
await provideCommentary(ddbClient, commentary);
responseBody = "\"message\": \"received commentary -- check dynamoDb!\"";
return {
statusCode: status,
body: responseBody
};
} catch (err) {
console.log(err);
return {
statusCode: 500,
body: JSON.stringify({
message: err.stack,
}),
};
}
};
const provideCommentary = async (ddbClient: DynamoDBClient, commentary: Commentary) => {
const params = {
TableName: "commentary",
Item: {
id: {
S: commentary.getId()
},
content: {
S: commentary.getContent()
},
createdAt: {
S: commentary.getCreatedAt()
},
providerId: {
N: commentary.getProviderId()
},
receiverId: {
N: commentary.getReceiverId()
}
}
};
console.log("about to try to insert commentary into dynamo...");
try {
console.log("wait for it...")
const rc = await ddbClient.send(new PutItemCommand(params));
console.log("DDB response:", rc);
} catch (err) {
console.log("hmmm something awry. something....in the mist");
console.log("Error", err.stack);
throw err;
}
};
Where commentary.ts is:
class Commentary {
private id: string;
private content: string;
private createdAt: Date;
private providerId: number;
private receiverId: number;
constructor(id: string, content: string, createdAt: Date, providerId: number, receiverId: number) {
this.id = id;
this.content = content;
this.createdAt = createdAt;
this.providerId = providerId;
this.receiverId = receiverId;
}
public getId(): string {
return this.id;
}
public getContent(): string {
return this.content;
}
public getCreatedAt(): Date {
return this.createdAt;
}
public getProviderId(): number {
return this.providerId;
}
public getReceiverId(): number {
return this.receiverId;
}
}
export { Commentary };
When I update the Lambda with this handler code, and hit the Lambda with the following curl (the Lambda is invoked by an API Gateway URL that I can hit via curl/http):
curl -i --request POST 'https://<my-api-gateway>.execute-api.us-east-1.amazonaws.com/Stage/feedback' \
--header 'Content-Type: application/json' -d '{"id":"123","content":"test feedback","createdAt":"2022-12-02T08:45:26.261-05:00","providerId":457,"receiverId":789}'
I get the following HTTP 500 response:
{"message":"SerializationException: NUMBER_VALUE cannot be converted to String\n
Am I passing it a bad request body (in the curl) or do I need to tweak something in app.ts and/or commentary.ts?
Interestingly the DynamoDB API expects numerical fields of items as strings. For example:
"N": "123.45"
The doc says:
Numbers are sent across the network to DynamoDB as strings, to maximize compatibility across languages and libraries. However, DynamoDB treats them as number type attributes for mathematical operations.
Have you tried sending your input with the numerical parameters as strings as shown below? (See providerId and receiverId)
{
"id":"123",
"content":"test feedback",
"createdAt":"2022-12-02T08:45:26.261-05:00",
"providerId":"457",
"receiverId":"789"
}
You can convert these IDs into string when you're populating your input Item:
providerId: {
N: String(commentary.getProviderId())
},
receiverId: {
N: String(commentary.getReceiverId())
}
You could also use .toString() but then you'd get errors if the field is not set (null or undefined).
Try using a promise to see the outcome:
client.send(command).then(
(data) => {
// process data.
},
(error) => {
// error handling.
}
);
Everything seems alright with your table setup; I believe it's a Lambda async issue with the JS SDK. I'm guessing Lambda is not waiting on your code and is exiting early. Can you include your full Lambda code?

How can I import bulk data from a CSV file into DynamoDB?

I am trying to import a CSV file data into AWS DynamoDB.
Here's what my CSV file looks like:
first_name last_name
sri ram
Rahul Dravid
JetPay Underwriter
Anil Kumar Gurram
In which language do you want to import the data? I just wrote a function in Node.js that can import a CSV file into a DynamoDB table. It first parses the whole CSV into an array, splits the array into chunks of 25, and then calls batchWriteItem on the table.
Note: DynamoDB only allows writing up to 25 records at a time in a batch insert, so we have to split our array into chunks.
var fs = require('fs');
var parse = require('csv-parse');
var async = require('async');
var csv_filename = "YOUR_CSV_FILENAME_WITH_ABSOLUTE_PATH";
rs = fs.createReadStream(csv_filename);
parser = parse({
columns : true,
delimiter : ','
}, function(err, data) {
var split_arrays = [], size = 25;
while (data.length > 0) {
split_arrays.push(data.splice(0, size));
}
data_imported = false;
chunk_no = 1;
async.each(split_arrays, function(item_data, callback) {
ddb.batchWriteItem({
"TABLE_NAME" : item_data
}, {}, function(err, res, cap) {
console.log('done going next');
if (err == null) {
console.log('Success chunk #' + chunk_no);
data_imported = true;
} else {
console.log(err);
console.log('Fail chunk #' + chunk_no);
data_imported = false;
}
chunk_no++;
callback();
});
}, function() {
// run after loops
console.log('all data imported....');
});
});
rs.pipe(parser);
Updated 2019 JavaScript code
I didn't have much luck with any of the JavaScript code samples above. Starting with Hassan Siddique's answer above, I've updated it to the latest API, included sample credential code, moved all user config to the top, added uuid()s when missing, and stripped out blank strings.
const fs = require('fs');
const parse = require('csv-parse');
const async = require('async');
const uuid = require('uuid/v4');
const AWS = require('aws-sdk');
// --- start user config ---
const AWS_CREDENTIALS_PROFILE = 'serverless-admin';
const CSV_FILENAME = "./majou.csv";
const DYNAMODB_REGION = 'eu-central-1';
const DYNAMODB_TABLENAME = 'entriesTable';
// --- end user config ---
const credentials = new AWS.SharedIniFileCredentials({
profile: AWS_CREDENTIALS_PROFILE
});
AWS.config.credentials = credentials;
const docClient = new AWS.DynamoDB.DocumentClient({
region: DYNAMODB_REGION
});
const rs = fs.createReadStream(CSV_FILENAME);
const parser = parse({
columns: true,
delimiter: ','
}, function(err, data) {
var split_arrays = [],
size = 25;
while (data.length > 0) {
split_arrays.push(data.splice(0, size));
}
data_imported = false;
chunk_no = 1;
async.each(split_arrays, function(item_data, callback) {
const params = {
RequestItems: {}
};
params.RequestItems[DYNAMODB_TABLENAME] = [];
item_data.forEach(item => {
for (key of Object.keys(item)) {
// An AttributeValue may not contain an empty string
if (item[key] === '')
delete item[key];
}
params.RequestItems[DYNAMODB_TABLENAME].push({
PutRequest: {
Item: {
id: uuid(),
...item
}
}
});
});
docClient.batchWrite(params, function(err, res, cap) {
console.log('done going next');
if (err == null) {
console.log('Success chunk #' + chunk_no);
data_imported = true;
} else {
console.log(err);
console.log('Fail chunk #' + chunk_no);
data_imported = false;
}
chunk_no++;
callback();
});
}, function() {
// run after loops
console.log('all data imported....');
});
});
rs.pipe(parser);
I've created a gem for this.
Now you can install it by running gem install dynamocli, then you can use the command:
dynamocli import your_data.csv --to your_table
Here is the link to the source code: https://github.com/matheussilvasantos/dynamocli
As a lowly dev without perms to create a Data Pipeline, I had to use this javascript. Hassan Sidique's code was slightly out of date, but this worked for me:
var fs = require('fs');
var parse = require('csv-parse');
var async = require('async');
const AWS = require('aws-sdk');
const dynamodbDocClient = new AWS.DynamoDB({ region: "eu-west-1" });
var csv_filename = "./CSV.csv";
rs = fs.createReadStream(csv_filename);
parser = parse({
columns : true,
delimiter : ','
}, function(err, data) {
var split_arrays = [], size = 25;
while (data.length > 0) {
//split_arrays.push(data.splice(0, size));
let cur25 = data.splice(0, size)
let item_data = []
for (var i = cur25.length - 1; i >= 0; i--) {
const this_item = {
"PutRequest" : {
"Item": {
// your column names here will vary, but you'll need to define the type
"Title": {
"S": cur25[i].Title
},
"Col2": {
"N": cur25[i].Col2
},
"Col3": {
"N": cur25[i].Col3
}
}
}
};
item_data.push(this_item)
}
split_arrays.push(item_data);
}
data_imported = false;
chunk_no = 1;
async.each(split_arrays, (item_data, callback) => {
const params = {
RequestItems: {
"tagPerformance" : item_data
}
}
dynamodbDocClient.batchWriteItem(params, function(err, res, cap) {
if (err === null) {
console.log('Success chunk #' + chunk_no);
data_imported = true;
} else {
console.log(err);
console.log('Fail chunk #' + chunk_no);
data_imported = false;
}
chunk_no++;
callback();
});
}, () => {
// run after loops
console.log('all data imported....');
});
});
rs.pipe(parser);
You can use AWS Data Pipeline which is for things like this. You can upload your csv file to S3 and then use Data Pipeline to retrieve and populate a DynamoDB table. They have a step-by-step tutorial.
I wrote a tool to do this using parallel execution that requires no dependencies or developer tooling installed on the machine (it's written in Go).
It can handle:
Comma separated (CSV) files
Tab separated (TSV) files
Large files
Local files
Files on S3
Parallel imports within AWS using AWS Step Functions to import > 4M rows per minute
No dependencies (no need for .NET, Python, Node.js, Docker, AWS CLI etc.)
It's available for MacOS, Linux, Windows and Docker: https://github.com/a-h/ddbimport
Here's the results of my tests showing that it can import a lot faster in parallel using AWS Step Functions.
I'm describing the tool in more detail at AWS Community Summit on the 15th May 2020 at 1155 BST - https://www.twitch.tv/awscomsum
Before getting to my code, some notes on testing this locally
I recommend using a local version of DynamoDB, in case you want to sanity check this before you start incurring charges and what not. I made some small modifications before posting this, so be sure to test with whatever means make sense to you. There is a fake batch upload job I commented out, which you could use in lieu of any DynamoDB service, remote or local, to verify in stdout that this is working to your needs.
dynamodb-local
See dynamodb-local on npmjs or manual install
If you went the manual install route, you can start dynamodb-local with something like this:
java -Djava.library.path=<PATH_TO_DYNAMODB_LOCAL>/DynamoDBLocal_lib/\
-jar <PATH_TO_DYNAMODB_LOCAL>/DynamoDBLocal.jar\
-inMemory\
-sharedDb
The npm route may be simpler.
dynamodb-admin
Along with that, see dynamodb-admin.
I installed dynamodb-admin with npm i -g dynamodb-admin. It can then be run with:
dynamodb-admin
Using them:
dynamodb-local defaults to localhost:8000.
dynamodb-admin is a web page that defaults to localhost:8001. Once you launch these two services, open localhost:8001 in your browser to view and manipulate the database.
The script below doesn't create the database. Use dynamodb-admin for this.
Credit goes to...
Ben Nadel.
The code
I'm not as experienced with JS & Node.js as I am with other languages, so please forgive any JS faux pas.
You'll notice each group of concurrent batches is purposely slowed down by 900ms. This was a hacky solution, and I'm leaving it here to serve as an example (and because of laziness, and because you're not paying me).
If you increase MAX_CONCURRENT_BATCHES, you will want to calculate the appropriate delay amount based on your WCU, item size, batch size, and the new concurrency level.
Another approach would be to turn on Auto Scaling and implement exponential backoff for each failed batch. Like I mention below in one of the comments, this really shouldn't be necessary with some back-of-the-envelope calculations to figure out how many writes you can actually do, given your WCU limit and data size, and just let your code run at a predictable rate the entire time.
You might wonder why I didn't just let AWS SDK handle concurrency. Good question. Probably would have made this slightly simpler. You could experiment by applying the MAX_CONCURRENT_BATCHES to the maxSockets config option, and modifying the code that creates arrays of batches so that it only passes individual batches forward.
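If you do experiment with that, pointing the SDK at a shared agent with a maxSockets cap is one way to do it. This is only a sketch; it reuses the MAX_CONCURRENT_BATCHES constant defined in the script below and is not part of the original script:
// Sketch: cap how many concurrent connections the AWS SDK v2 will open.
// Use http.Agent instead if you target a plain-HTTP endpoint such as dynamodb-local.
const https = require("https");
const AWS = require("aws-sdk");
AWS.config.update({
  httpOptions: {
    agent: new https.Agent({ maxSockets: MAX_CONCURRENT_BATCHES })
  }
});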
/**
* Uploads CSV data to DynamoDB.
*
* 1. Streams a CSV file line-by-line.
* 2. Parses each line to a JSON object.
* 3. Collects batches of JSON objects.
* 4. Converts batches into the PutRequest format needed by AWS.DynamoDB.batchWriteItem
* and runs 1 or more batches at a time.
*/
const AWS = require("aws-sdk")
const chalk = require('chalk')
const fs = require('fs')
const split = require('split2')
const uuid = require('uuid')
const through2 = require('through2')
const { Writable } = require('stream');
const { Transform } = require('stream');
const CSV_FILE_PATH = __dirname + "/../assets/whatever.csv"
// A whitelist of the CSV columns to ingest.
const CSV_KEYS = [
"id",
"name",
"city"
]
// Inadequate WCU will cause "insufficient throughput" exceptions, which in this script are not currently
// handled with retry attempts. Retries are not necessary as long as you consistently
// stay under the WCU, which isn't that hard to predict.
// The number of records to pass to AWS.DynamoDB.DocumentClient.batchWrite
// See https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_BatchWriteItem.html
const MAX_RECORDS_PER_BATCH = 25
// The number of batches to upload concurrently.
// https://docs.aws.amazon.com/sdk-for-javascript/v2/developer-guide/node-configuring-maxsockets.html
const MAX_CONCURRENT_BATCHES = 1
// MAKE SURE TO LAUNCH `dynamodb-local` EXTERNALLY FIRST IF USING LOCALHOST!
AWS.config.update({
region: "us-west-1"
,endpoint: "http://localhost:8000" // Comment out to hit live DynamoDB service.
});
const db = new AWS.DynamoDB()
// Create a file line reader.
var fileReaderStream = fs.createReadStream(CSV_FILE_PATH)
var lineReaderStream = fileReaderStream.pipe(split())
var linesRead = 0
// Attach a stream that transforms text lines into JSON objects.
var skipHeader = true
var csvParserStream = lineReaderStream.pipe(
through2(
{
objectMode: true,
highWaterMark: 1
},
function handleWrite(chunk, encoding, callback) {
// ignore CSV header
if (skipHeader) {
skipHeader = false
callback()
return
}
linesRead++
// transform line into stringified JSON
const values = chunk.toString().split(',')
const ret = {}
CSV_KEYS.forEach((keyName, index) => {
ret[keyName] = values[index]
})
ret.line = linesRead
console.log(chalk.cyan.bold("csvParserStream:",
"line:", linesRead + ".",
chunk.length, "bytes.",
ret.id
))
callback(null, ret)
}
)
)
// Attach a stream that collects incoming json lines to create batches.
// Outputs an array (<= MAX_CONCURRENT_BATCHES) of arrays (<= MAX_RECORDS_PER_BATCH).
var batchingStream = (function batchObjectsIntoGroups(source) {
var batchBuffer = []
var idx = 0
var batchingStream = source.pipe(
through2.obj(
{
objectMode: true,
writableObjectMode: true,
highWaterMark: 1
},
function handleWrite(item, encoding, callback) {
var batchIdx = Math.floor(idx / MAX_RECORDS_PER_BATCH)
if (idx % MAX_RECORDS_PER_BATCH == 0 && batchIdx < MAX_CONCURRENT_BATCHES) {
batchBuffer.push([])
}
batchBuffer[batchIdx].push(item)
if (MAX_CONCURRENT_BATCHES == batchBuffer.length &&
MAX_RECORDS_PER_BATCH == batchBuffer[MAX_CONCURRENT_BATCHES-1].length)
{
this.push(batchBuffer)
batchBuffer = []
idx = 0
} else {
idx++
}
callback()
},
function handleFlush(callback) {
if (batchBuffer.length) {
this.push(batchBuffer)
}
callback()
}
)
)
return (batchingStream);
})(csvParserStream)
// Attach a stream that transforms batch buffers to collections of DynamoDB batchWrite jobs.
var databaseStream = new Writable({
objectMode: true,
highWaterMark: 1,
write(batchBuffer, encoding, callback) {
console.log(chalk.yellow(`Batch being processed.`))
// Create `batchBuffer.length` batchWrite jobs.
var jobs = batchBuffer.map(batch =>
buildBatchWriteJob(batch)
)
// Run multiple batch-write jobs concurrently.
Promise
.all(jobs)
.then(results => {
console.log(chalk.bold.red(`${batchBuffer.length} batches completed.`))
})
.catch(error => {
console.log( chalk.red( "ERROR" ), error )
callback(error)
})
.then( () => {
console.log( chalk.bold.red("Resuming file input.") )
setTimeout(callback, 900) // slow down the uploads. calculate this based on WCU, item size, batch size, and concurrency level.
})
// return false
}
})
batchingStream.pipe(databaseStream)
// Builds a batch-write job that runs as an async promise.
function buildBatchWriteJob(batch) {
let params = buildRequestParams(batch)
// This was being used temporarily prior to hooking up the script to any dynamo service.
// let fakeJob = new Promise( (resolve, reject) => {
// console.log(chalk.green.bold( "Would upload batch:",
// pluckValues(batch, "line")
// ))
// let t0 = new Date().getTime()
// // fake timing
// setTimeout(function() {
// console.log(chalk.dim.yellow.italic(`Batch upload time: ${new Date().getTime() - t0}ms`))
// resolve()
// }, 300)
// })
// return fakeJob
let promise = new Promise(
function(resolve, reject) {
let t0 = new Date().getTime()
let printItems = function(msg, items) {
console.log(chalk.green.bold(msg, pluckValues(batch, "id")))
}
let processItemsCallback = function (err, data) {
if (err) {
console.error(`Failed at batch: ${pluckValues(batch, "line")}, ${pluckValues(batch, "id")}`)
console.error("Error:", err)
reject()
} else {
var params = {}
params.RequestItems = data.UnprocessedItems
var numUnprocessed = Object.keys(params.RequestItems).length
if (numUnprocessed != 0) {
console.log(`Encountered ${numUnprocessed}`)
printItems("Retrying unprocessed items:", params)
db.batchWriteItem(params, processItemsCallback)
} else {
console.log(chalk.dim.yellow.italic(`Batch upload time: ${new Date().getTime() - t0}ms`))
resolve()
}
}
}
db.batchWriteItem(params, processItemsCallback)
}
)
return (promise)
}
// Build request payload for the batchWrite
function buildRequestParams(batch) {
var params = {
RequestItems: {}
}
params.RequestItems.Provider = batch.map(obj => {
let item = {}
CSV_KEYS.forEach((keyName, index) => {
if (obj[keyName] && obj[keyName].length > 0) {
item[keyName] = { "S": obj[keyName] }
}
})
return {
PutRequest: {
Item: item
}
}
})
return params
}
function pluckValues(batch, fieldName) {
var values = batch.map(item => {
return (item[fieldName])
})
return (values)
}
Here's my solution. I relied on the fact that there was some type of header indicating which column did what. Simple and straightforward. No pipeline nonsense for a quick upload.
import os, json, csv, yaml, time
from tqdm import tqdm
# For Database
import boto3

# Variable store
environment = {}

# Environment variables
with open("../env.yml", 'r') as stream:
    try:
        environment = yaml.load(stream)
    except yaml.YAMLError as exc:
        print(exc)

# Get the service resource.
dynamodb = boto3.resource('dynamodb',
    aws_access_key_id=environment['AWS_ACCESS_KEY'],
    aws_secret_access_key=environment['AWS_SECRET_KEY'],
    region_name=environment['AWS_REGION_NAME'])

# Instantiate a table resource object without actually
# creating a DynamoDB table. Note that the attributes of this table
# are lazy-loaded: a request is not made nor are the attribute
# values populated until the attributes
# on the table resource are accessed or its load() method is called.
table = dynamodb.Table('data')

# Header
header = []

# Open CSV
with open('export.csv') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    # Parse Each Line
    with table.batch_writer() as batch:
        for index, row in enumerate(tqdm(reader)):
            if index == 0:
                # save the header to be used as the keys
                header = row
            else:
                if not row:  # skip empty lines (csv.reader yields an empty list for them)
                    continue
                # Create JSON Object
                # Push to DynamoDB
                data = {}
                # Iterate over each column
                for col, entry in enumerate(header):
                    data[entry.lower()] = row[col]
                response = batch.put_item(
                    Item=data
                )
                # Repeat
Another quick workaround is to load your CSV to RDS or any other mysql instance first, which is quite easy to do (https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Introduction.html) and then use DMS (AWS Database Migration Service) to load the entire data to dynamodb. You'll have to create a role for DMS before you can load the data. But this works wonderfully without having to run any scripts.
I used https://github.com/GorillaStack/dynamodb-csv-export-import. It is super simple and worked like a charm. I just followed the instructions in the README:
# Install globally
npm i -g @gorillastack/dynamodb-csv-export-import
# Set AWS region
export AWS_DEFAULT_REGION=us-east-1
# Use it for your CSV and dynamo table
dynamodb-csv-export-import my-exported-file.csv MyDynamoDbTableName
Here's a simpler solution. And with this solution, you don't have to remove empty string attributes.
require('./env'); //contains aws secret/access key
const parse = require('csvtojson');
const AWS = require('aws-sdk');
// --- start user config ---
const CSV_FILENAME = __dirname + "/002_subscribers_copy_from_db.csv";
const DYNAMODB_TABLENAME = '002-Subscribers';
// --- end user config ---
//You could add your credentials here or you could
//store it in process.env like I have done aws-sdk
//would detect the keys in the environment
AWS.config.update({
region: process.env.AWS_REGION
});
const db = new AWS.DynamoDB.DocumentClient({
convertEmptyValues: true
});
(async ()=>{
const json = await parse().fromFile(CSV_FILENAME);
//this is efficient enough if you're processing small
//amounts of data. If your data set is large then I
//suggest using dynamodb method .batchWrite() and send
//in data in chunks of 25 (the limit) and find yourself
//a more efficient loop if there is one
for(var i=0; i<json.length; i++){
console.log(`processing item number ${i+1}`);
let query = {
TableName: DYNAMODB_TABLENAME,
Item: json[i]
};
await db.put(query).promise();
/**
* Note: If "json" contains other nested objects, you would have to
* loop through the json and parse all child objects.
* likewise, you would have to convert all children into their
* native primitive types because everything would be represented
* as a string.
*/
}
console.log('\nDone.');
})();
One way of importing/exporting stuff:
"""
Batch-writes data from a file to a dynamo-db database.
"""
import json
import boto3
# Get items from DynamoDB table like this:
# aws dynamodb scan --table-name <table-name>
# Create dynamodb client.
client = boto3.client(
'dynamodb',
aws_access_key_id='',
aws_secret_access_key=''
)
with open('', 'r') as file:
data = json.loads(file.read())['Items']
# Execute write-data request for each item.
for item in data:
client.put_item(
TableName='',
Item=item
)
The simplest solution is probably to use a template / solution made by AWS:
Implementing bulk CSV ingestion to Amazon DynamoDB
https://aws.amazon.com/blogs/database/implementing-bulk-csv-ingestion-to-amazon-dynamodb/
With this approach, you use the template provided to create a CloudFormation stack including an S3 bucket, a Lambda function, and a new DynamoDB table. The lambda is triggered to run on upload to the S3 bucket and inserts into the table in batches.
In my case, I wanted to insert into an existing table, so I just changed the Lambda function's environment variable once the stack was created.
Follow the instructions in the following link to import data into existing tables in DynamoDB:
https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/SampleData.LoadData.html
Please note that the table name is what you must find here:
https://console.aws.amazon.com/dynamodbv2/home
The table name is used inside the JSON file; the name of the JSON file itself is not important. For example, I have a table named Country-kdezpod7qrap7nhpjghjj-staging, so to import data into that table I must create a JSON file like this:
{
"Country-kdezpod7qrap7nhpjghjj-staging": [
{
"PutRequest": {
"Item": {
"id": {
"S": "ir"
},
"__typename": {
"S": "Country"
},
"createdAt": {
"S": "2021-01-04T12:32:09.012Z"
},
"name": {
"S": "Iran"
},
"self": {
"N": "1"
},
"updatedAt": {
"S": "2021-01-04T12:32:09.012Z"
}
}
}
}
]
}
If you don't know how to create the items for each PutRequest, you can create an item in your DB with a mutation and then duplicate it; that will show you the structure of one item.
If you have a huge list of items in your CSV file, you can use the following npm tool to generate the json file:
https://www.npmjs.com/package/json-dynamo-putrequest
Then we can use the following command to import the data:
aws dynamodb batch-write-item --request-items file://Country.json
If it imports the data successfully, you should see the following output:
{
"UnprocessedItems": {}
}
Also please note that with this method you can only have 25 PutRequest items in your array. So if you want to push 100 items you need to create 4 files.
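If the source list is long, a small Node.js script can do that splitting for you. This is only a sketch; the input file name is made up, and it assumes you already have an array of PutRequest objects for the table from the example above:
// Sketch: split an array of PutRequest items into files of 25 requests each, ready for
// `aws dynamodb batch-write-item --request-items file://Country-1.json` and so on.
const fs = require("fs");

const TABLE_NAME = "Country-kdezpod7qrap7nhpjghjj-staging"; // table name from the example above
const items = JSON.parse(fs.readFileSync("all-items.json", "utf8")); // assumed: an array of PutRequest objects

for (let i = 0; i * 25 < items.length; i++) {
  const chunk = { [TABLE_NAME]: items.slice(i * 25, (i + 1) * 25) };
  fs.writeFileSync(`Country-${i + 1}.json`, JSON.stringify(chunk, null, 2));
}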
You can try using batch writes and multiprocessing to speed up your bulk import.
import csv
import time
import boto3
from multiprocessing.dummy import Pool as ThreadPool

pool = ThreadPool(4)
current_milli_time = lambda: int(round(time.time() * 1000))
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('table_name')

def add_users_in_batch(data):
    with table.batch_writer() as batch:
        for item in data:
            batch.put_item(Item=item)

def run_batch_migration():
    start = current_milli_time()
    row_count = 0
    batch = []
    batches = []
    with open(CSV_PATH, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        for row in reader:
            row_count += 1
            item = {
                'email': row[0],
                'country': row[1]
            }
            batch.append(item)
            if row_count % 25 == 0:
                batches.append(batch)
                batch = []
        batches.append(batch)
        pool.map(add_users_in_batch, batches)
    print('Number of rows processed - ', str(row_count))
    end = current_milli_time()
    print('Total time taken for migration : ', str((end - start) / 1000), ' secs')

if __name__ == "__main__":
    run_batch_migration()
Try this; it is simple and helpful.
You can now natively bulk import into DynamoDB in CSV, DynamoDB JSON or Amazon Ion formats. This requires your data to be present in an S3 bucket. No code required.
blog - https://aws.amazon.com/blogs/database/amazon-dynamodb-can-now-import-amazon-s3-data-into-a-new-table/
docs - https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/S3DataImport.HowItWorks.html
Key considerations while using this native feature particularly for CSV data:
You can specify the table's Partition Key (PK)/Sort Key (SK) and their data types, and all other CreateTable parameters
Feature currently supports only importing into a new table each time
Data with the same PK and SK will be overwritten (similar to a PutItem operation)
Except for the PK and SK, all other fields in the CSV will be considered as DynamoDB Strings. If this is not favorable, you can convert the data into DynamoDB JSON/Amazon Ion format before importing with explicit data types
Any Global Secondary Indexes created as part of the ImportTable operation will be populated free of cost. Import cost depends on uncompressed source data size.
GSIs created at import time will also map data types as per the source data; all non-key attributes will, however, still be treated as DynamoDB Strings
ImportTable consumes no write capacity on the table, so you could create the table with 1 WCU and the import performance will be the same as an import performed for a table with 100K WCU
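For completeness, the import can also be started programmatically rather than from the console. The following is only a rough sketch with the v3 JavaScript SDK; the bucket, prefix, and table details are placeholders, and the parameter shapes reflect my reading of the ImportTable API, so verify them against the docs linked above:
// Sketch: start a native S3 -> DynamoDB import of CSV data into a NEW table.
// All names below are placeholders.
import { DynamoDBClient, ImportTableCommand } from "@aws-sdk/client-dynamodb";

const client = new DynamoDBClient({ region: "us-east-1" });

await client.send(new ImportTableCommand({
  S3BucketSource: { S3Bucket: "my-import-bucket", S3KeyPrefix: "exports/" },
  InputFormat: "CSV",
  TableCreationParameters: {
    TableName: "my-new-table",
    AttributeDefinitions: [{ AttributeName: "id", AttributeType: "S" }],
    KeySchema: [{ AttributeName: "id", KeyType: "HASH" }],
    BillingMode: "PAY_PER_REQUEST",
  },
}));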

Advanced update using mongodb [duplicate]

In MongoDB, is it possible to update the value of a field using the value from another field? The equivalent SQL would be something like:
UPDATE Person SET Name = FirstName + ' ' + LastName
And the MongoDB pseudo-code would be:
db.person.update( {}, { $set : { name : firstName + ' ' + lastName } } );
The best way to do this is with version 4.2+, which allows using an aggregation pipeline in the update document with the updateOne, updateMany, or update (deprecated in most if not all language drivers) collection methods.
MongoDB 4.2+
Version 4.2 also introduced the $set pipeline stage operator, which is an alias for $addFields. I will use $set here as it maps with what we are trying to achieve.
db.collection.<update method>(
{},
[
{"$set": {"name": { "$concat": ["$firstName", " ", "$lastName"]}}}
]
)
Note that square brackets in the second argument to the method specify an aggregation pipeline instead of a plain update document because using a simple document will not work correctly.
MongoDB 3.4+
In 3.4+, you can use $addFields and the $out aggregation pipeline operators.
db.collection.aggregate(
[
{ "$addFields": {
"name": { "$concat": [ "$firstName", " ", "$lastName" ] }
}},
{ "$out": <output collection name> }
]
)
Note that this does not update your collection but instead replaces the existing collection or creates a new one. Also, for update operations that require "typecasting", you will need client-side processing, and depending on the operation, you may need to use the find() method instead of the .aggregate() method.
MongoDB 3.2 and 3.0
The way we do this is by $projecting our documents and using the $concat string aggregation operator to return the concatenated string.
You then iterate the cursor and use the $set update operator to add the new field to your documents using bulk operations for maximum efficiency.
Aggregation query:
var cursor = db.collection.aggregate([
{ "$project": {
"name": { "$concat": [ "$firstName", " ", "$lastName" ] }
}}
])
MongoDB 3.2 or newer
You need to use the bulkWrite method.
var requests = [];
cursor.forEach(document => {
requests.push( {
'updateOne': {
'filter': { '_id': document._id },
'update': { '$set': { 'name': document.name } }
}
});
if (requests.length === 500) {
//Execute per 500 operations and re-init
db.collection.bulkWrite(requests);
requests = [];
}
});
if(requests.length > 0) {
db.collection.bulkWrite(requests);
}
MongoDB 2.6 and 3.0
From this version, you need to use the now deprecated Bulk API and its associated methods.
var bulk = db.collection.initializeUnorderedBulkOp();
var count = 0;
cursor.snapshot().forEach(function(document) {
bulk.find({ '_id': document._id }).updateOne( {
'$set': { 'name': document.name }
});
count++;
if(count%500 === 0) {
// Execute per 500 operations and re-init
bulk.execute();
bulk = db.collection.initializeUnorderedBulkOp();
}
})
// clean up queues
if(count > 0) {
bulk.execute();
}
MongoDB 2.4
cursor["result"].forEach(function(document) {
db.collection.update(
{ "_id": document._id },
{ "$set": { "name": document.name } }
);
})
You should iterate through. For your specific case:
db.person.find().snapshot().forEach(
function (elem) {
db.person.update(
{
_id: elem._id
},
{
$set: {
name: elem.firstname + ' ' + elem.lastname
}
}
);
}
);
Apparently there is a way to do this efficiently since MongoDB 3.4, see styvane's answer.
Obsolete answer below
You cannot refer to the document itself in an update (yet). You'll need to iterate through the documents and update each document using a function. See this answer for an example, or this one for server-side eval().
For a database with high activity, you may run into issues where your updates affect actively changing records and for this reason I recommend using snapshot()
db.person.find().snapshot().forEach( function (hombre) {
hombre.name = hombre.firstName + ' ' + hombre.lastName;
db.person.save(hombre);
});
http://docs.mongodb.org/manual/reference/method/cursor.snapshot/
Starting Mongo 4.2, db.collection.update() can accept an aggregation pipeline, finally allowing the update/creation of a field based on another field:
// { firstName: "Hello", lastName: "World" }
db.collection.updateMany(
{},
[{ $set: { name: { $concat: [ "$firstName", " ", "$lastName" ] } } }]
)
// { "firstName" : "Hello", "lastName" : "World", "name" : "Hello World" }
The first part {} is the match query, filtering which documents to update (in our case all documents).
The second part [{ $set: { name: { ... } } }] is the update aggregation pipeline (note the square brackets signifying the use of an aggregation pipeline). $set is a new aggregation operator and an alias of $addFields.
Regarding this answer, the snapshot function is deprecated in version 3.6, according to this update. So, on version 3.6 and above, it is possible to perform the operation this way:
db.person.find().forEach(
function (elem) {
db.person.update(
{
_id: elem._id
},
{
$set: {
name: elem.firstname + ' ' + elem.lastname
}
}
);
}
);
I tried the above solution but I found it unsuitable for large amounts of data. I then discovered the stream feature:
MongoClient.connect("...", function(err, db){
var c = db.collection('yourCollection');
var s = c.find({/* your query */}).stream();
s.on('data', function(doc){
c.update({_id: doc._id}, {$set: {name : doc.firstName + ' ' + doc.lastName}}, function(err, result) { /* result == true? */ });
});
s.on('end', function(){
// stream can end before all your updates do if you have a lot
})
})
The update() method takes an aggregation pipeline as a parameter, like:
db.collection_name.update(
{
// Query
},
[
// Aggregation pipeline
{ "$set": { "id": "$_id" } }
],
{
// Options
"multi": true // false when a single doc has to be updated
}
)
A field can be set or unset using existing values with the aggregation pipeline.
Note: use $ with the field name to refer to the field that has to be read.
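For example, a minimal sketch (the field names here are made up) that both reads an existing field and removes another one in the same pipeline update:
// Sketch (MongoDB 4.2+): copy the value of "name" into "displayName" and drop a temporary
// field in one pipeline update. displayName and tempField are hypothetical field names.
db.collection_name.update(
  {},                                        // match all documents
  [
    { "$set": { "displayName": "$name" } },  // "$name" reads the existing field's value
    { "$unset": ["tempField"] }              // remove a field via the pipeline
  ],
  { "multi": true }
)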
Here's what we came up with for copying one field to another for ~150_000 records. It took about 6 minutes, but is still significantly less resource intensive than it would have been to instantiate and iterate over the same number of ruby objects.
js_query = %({
$or : [
{
'settings.mobile_notifications' : { $exists : false },
'settings.mobile_admin_notifications' : { $exists : false }
}
]
})
js_for_each = %(function(user) {
if (!user.settings.hasOwnProperty('mobile_notifications')) {
user.settings.mobile_notifications = user.settings.email_notifications;
}
if (!user.settings.hasOwnProperty('mobile_admin_notifications')) {
user.settings.mobile_admin_notifications = user.settings.email_admin_notifications;
}
db.users.save(user);
})
js = "db.users.find(#{js_query}).forEach(#{js_for_each});"
Mongoid::Sessions.default.command('$eval' => js)
With MongoDB version 4.2+, updates are more flexible as it allows the use of the aggregation pipeline in update, updateOne and updateMany. You can now transform your documents using the aggregation operators and then update without the need to explicitly state the $set command (instead we use $replaceRoot: {newRoot: "$$ROOT"}).
Here we use the aggregate query to extract the timestamp from MongoDB's ObjectID "_id" field and update the documents (I am not an expert in SQL, but I think SQL does not provide an auto-generated ObjectID that has a timestamp in it; you would have to create that date yourself).
var collection = "person"
agg_query = [
{
"$addFields" : {
"_last_updated" : {
"$toDate" : "$_id"
}
}
},
{
$replaceRoot: {
newRoot: "$$ROOT"
}
}
]
db.getCollection(collection).updateMany({}, agg_query, {upsert: true})
(I would have posted this as a comment, but couldn't)
For anyone who lands here trying to update one field using another in the document with the c# driver...
I could not figure out how to use any of the UpdateXXX methods and their associated overloads since they take an UpdateDefinition as an argument.
// we want to set Prop1 to Prop2
class Foo { public string Prop1 { get; set; } public string Prop2 { get; set;} }
void Test()
{
var update = new UpdateDefinitionBuilder<Foo>();
update.Set(x => x.Prop1, <new value; no way to get a hold of the object that I can find>)
}
As a workaround, I found that you can use the RunCommand method on an IMongoDatabase (https://docs.mongodb.com/manual/reference/command/update/#dbcmd.update).
var command = new BsonDocument
{
{ "update", "CollectionToUpdate" },
{ "updates", new BsonArray
{
new BsonDocument
{
// Any filter; here the check is if Prop1 does not exist
{ "q", new BsonDocument{ ["Prop1"] = new BsonDocument("$exists", false) }},
// set it to the value of Prop2
{ "u", new BsonArray { new BsonDocument { ["$set"] = new BsonDocument("Prop1", "$Prop2") }}},
{ "multi", true }
}
}
}
};
database.RunCommand<BsonDocument>(command);
MongoDB 4.2+ Golang
result, err := collection.UpdateMany(ctx, bson.M{},
    mongo.Pipeline{
        bson.D{{"$set",
            bson.M{"name": bson.M{"$concat": []string{"$lastName", " ", "$firstName"}}},
        }},
    },
)

How do you sort results of a _View_ by value in the in Couchbase?

So, from what I understand, in Couchbase one can sort keys by using
descending=true
but in my case I want to sort by value instead. Consider the Twitter data in JSON format; my question is: what is the most popular user mentioned?
Each tweet has the structure of:
{
  "text": "",
  "entities" : {
    "hashtags" : [ ... ],
    "user_mentions" : [ ... ],
    "urls" : [ ... ]
  }
}
Having used MongoDB before, I reused the map function and modified it slightly to be usable in Couchbase, as follows:
function (doc, meta) {
if (!doc.entities) { return; }
doc.entities.user_mentions.forEach(
function(mention) {
if (mention.screen_name !== undefined) {
emit(mention.screen_name, null);
}
}
)
}
And then I used the _count reduce function to count all the screen_name occurrences. Now my problem is: how do I sort by the count values rather than by the key?
Thanks
The short answer is that you cannot sort the result of your view by value; you can only sort by key.
Some workarounds would be to either:
analyze the data before inserting it into Couchbase and maintain a counter for the values you are interested in (mentions in your case), as in the sketch after this list
use the view you have and sort on the application side, if the size of the view result is acceptable for a client-side sort.
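Here is a sketch of the first workaround, assuming the Couchbase Node.js SDK 2.x and a bucket named social (both assumptions): keep one counter document per mentioned user and bump it when a tweet is saved, then read the counters you care about instead of sorting a view by value.
// Sketch: maintain a counter per mentioned screen_name at write time (assumes SDK 2.x).
var couchbase = require('couchbase');
var cluster = new couchbase.Cluster('couchbase://127.0.0.1');
var bucket = cluster.openBucket('social');

function countMentions(tweet, done) {
  var mentions = (tweet.entities && tweet.entities.user_mentions) || [];
  var remaining = mentions.length;
  if (remaining === 0) { return done(); }
  mentions.forEach(function (mention) {
    // One counter key per screen_name, created with an initial value of 1.
    bucket.counter('mentions::' + mention.screen_name, 1, { initial: 1 }, function (err) {
      if (--remaining === 0) { done(err); }
    });
  });
}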
For the second workaround (client-side sorting), the following JS code calls a view, sorts the result, and prints the 10 hottest subjects (hashtags):
var http = require('http');
var options = {
host: '127.0.0.1',
port: 8092,
path: '/social/_design/dev_tags/_view/tags?full_set=true&connection_timeout=60000&group=true',
method: 'GET'
}
http.request(
options,
function(res) {
var buf = new Buffer(0);
res.on('data', function(data) {
buf += data;
});
res.on('end', function() {
var tweets = JSON.parse(buf);
var rows = tweets.rows;
rows.sort( function (a,b){ return b.value - a.value }
);
for ( var i = 0; i < 10; i++ ) {
console.log( rows[i] );
}
});
}
).end();
At the same time, I am looking at other options to achieve this.
I solved this by using a compound key.
function (doc, meta) {
emit([doc.constraint,doc.yoursortvalue]);
}
url elements:
&startkey=["jim",5]&endkey=["jim",10]&descending=true