Trigger a Cloud Function whenever a new file is uploaded to a Cloud Storage bucket. This function should call a Dataproc job written in PySpark to read the file and load it into BigQuery.
I want to know how to call a Google Dataproc job from a Cloud Function. Please suggest.
I was able to create a simple Cloud Function that triggers a Dataproc job on a GCS file-creation event. In this example, the file in GCS contains a Pig query to execute. However, you can follow the Dataproc API documentation to create a PySpark version; a sketch of the changed job body follows the code below.
index.js:
exports.submitJob = (event, callback) => {
  const google = require('googleapis');
  const projectId = 'my-project';
  const clusterName = 'my-cluster';
  const file = event.data;

  if (file.name) {
    google.auth.getApplicationDefault(function (err, authClient, projectId) {
      if (err) {
        throw err;
      }
      const queryFileUri = "gs://" + file.bucket + "/" + file.name;
      console.log("Using queryFileUri: ", queryFileUri);
      if (authClient.createScopedRequired && authClient.createScopedRequired()) {
        authClient = authClient.createScoped([
          'https://www.googleapis.com/auth/cloud-platform',
          'https://www.googleapis.com/auth/userinfo.email'
        ]);
      }
      const dataproc = google.dataproc({ version: 'v1beta2', auth: authClient });
      dataproc.projects.regions.jobs.submit({
        projectId: projectId,
        region: "global",
        resource: {
          "job": {
            "placement": { "clusterName": clusterName },
            "pigJob": {
              "queryFileUri": queryFileUri
            }
          }
        }
      }, function (err, response) {
        if (err) {
          console.error("Error submitting job: ", err);
        }
        console.log("Dataproc response: ", response);
        // Signal completion only after the submit call has returned.
        callback();
      });
    });
  } else {
    console.log("Skipped processing: event has no file name.");
    callback();
  }
};
Make sure to set "Function to execute" to submitJob.
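For the PySpark version the question asks about, the only change needed in the submit call above is the job body: swap the pigJob block for a pysparkJob. A minimal sketch is below; the script path and its args are placeholders, and the script itself would be the PySpark job that reads the file and loads it into BigQuery. You would pass pysparkJobResource as the resource field of the submit request instead of the pigJob body above.
// Hypothetical job body for dataproc.projects.regions.jobs.submit; clusterName and
// file come from the surrounding function, and the script path is a placeholder.
const pysparkJobResource = {
  "job": {
    "placement": { "clusterName": clusterName },
    "pysparkJob": {
      "mainPythonFileUri": "gs://my-bucket/scripts/load_to_bigquery.py",
      // pass the uploaded object to the script as an argument
      "args": ["gs://" + file.bucket + "/" + file.name]
    }
  }
};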
package.json:
{
  "name": "sample-cloud-storage",
  "version": "0.0.1",
  "dependencies": { "googleapis": "^21.3.0" }
}
The following blog post gave me many ideas on how to get started:
https://cloud.google.com/blog/big-data/2016/04/scheduling-dataflow-pipelines-using-app-engine-cron-service-or-cloud-functions
Related
I have an app with an Amplify backend and I am getting an error when calling my Lambda:
"Error: Cannot find module 'stripe'\nRequire stack:\n- /var/task/index.js\n- /var/runtime/UserFunction.js\n- /var/runtime/index.js"
I have installed stripe and can see it in the dependencies of the main folder, but not in the dependencies for my function. I tried to install stripe in my function's folder with no change. I am using React 17.0.2. The stripe version is 9.6.0.
I found a post with a similar problem but they were already able to install stripe: AWS Lambda Error: Cannot find module 'stripe' Require stack
My Lambda:
const aws = require('aws-sdk');
const ddb = new aws.DynamoDB({apiVersion: '2012-10-08'});
const stripe = require('stripe')('secret key');

/**
 * @type {import('@types/aws-lambda').APIGatewayProxyHandler}
 */
exports.handler = async (event) => {
  try {
    const tableName = process.env.tableName
    const {username, email} = event.arguments.input
    const account = await stripe.accounts.create({
      type: 'express',
      email: `${email}`,
      metadata: {user: `${username}`}
    });
    console.log("Account creation response: ", account)
    console.log("Account id: ", account.id)
    // store the Stripe account id in DDB
    let ddbParams = {
      Item: {
        'stripe_id': `${account.id}`
      },
      TableName: tableName
    }
    try {
      await ddb.putItem(ddbParams).promise()
      console.log("Successfully updated stripe_id field")
    } catch (err) {
      console.log("Storing to DB error: ", err)
    }
    const accountId = account.id
    const accountLink = await stripe.accountLinks.create({
      account: accountId,
      // Swap for live website
      refresh_url: 'http://localhost:3000/profile',
      return_url: 'http://localhost:3000/listingform',
      type: 'account_onboarding',
    });
    console.log('Account link response :', accountLink)
    return accountLink
  } catch (err) {
    throw new Error(err)
  }
};
My package.json in my functions folder:
{
  "name": "createStripeConnectAccount",
  "version": "2.0.0",
  "description": "Lambda function generated by Amplify",
  "main": "index.js",
  "license": "Apache-2.0",
  "devDependencies": {
    "@types/aws-lambda": "^8.10.92"
  }
}
UPDATE:
I managed to install stripe and have it as a dependency but I still get the same error.
My package.json now looks like this:
{
  "name": "createStripeConnectAccount",
  "version": "2.0.0",
  "description": "Lambda function generated by Amplify",
  "main": "index.js",
  "license": "Apache-2.0",
  "devDependencies": {
    "@types/aws-lambda": "^8.10.92"
  },
  "dependencies": {
    "save": "^2.5.0",
    "stripe": "^9.6.0"
  }
}
The solution is to install Stripe inside the function's src folder. That folder didn't show up in the external terminal I had open, so I had to right-click the src folder, open a terminal there, and run npm install.
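For anyone following along, the fix boils down to running the install inside the function's own src directory and then redeploying (a sketch assuming the default Amplify project layout; substitute your function's name):
cd amplify/backend/function/createStripeConnectAccount/src
npm install stripe
amplify push
The key point is that the function's src folder has its own package.json, so stripe has to end up in that file's dependencies, not only in the project root's.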
Keeping this question up for any other new developers or people new to AWS.
I am trying to update a thing's shadow on AWS IoT Core by calling the function 'UpdateThingShadowCommand' from my vueJS web app.
I am following instructions from the documentation here:
https://docs.aws.amazon.com/AWSJavaScriptSDK/v3/latest/clients/client-iot-data-plane/classes/updatethingshadowcommand.html
However, when I execute the method 'UpdateThingShadowCommand', I keep running into the following error message:
net::ERR_CERT_AUTHORITY_INVALID
And another log message:
TypeError: Failed to fetch
My code is as follows:
import { IoTDataPlaneClient } from "#aws-sdk/client-iot-data-plane";
import { UpdateThingShadowCommand } from "#aws-sdk/client-iot-data-plane";
async myMethod () {
  const configIotDataPlaneClient = {
    apiVersion: 'XXXXXXXX',
    region: 'XXXXXXX',
    credentials: {
      accessKeyId: 'XXXXXXXXXXXXXXXX',
      secretAccessKey: 'XXXXXXXXXXXXXXXXX'
    }
  };

  // Initializing the client
  const clientShadow = new IoTDataPlaneClient(configIotDataPlaneClient);
  console.log(clientShadow)

  const inputShadow = {
    payload: new Uint8Array(
      Buffer.from(
        JSON.stringify({
          "state": {
            "reported": {
              "item1": "val1",
              "item2": "val2"
            }
          }
        }),
      ),
    ),
    //shadowName: "",
    thingName: "thing-name"
  }

  // Updating a thing's shadow
  try {
    const commandShadow = new UpdateThingShadowCommand(inputShadow);
    console.log(commandShadow)
    const responseShadow = await clientShadow.send(commandShadow);
    console.log("Update Shadow response", responseShadow)
  }
  catch (error) {
    console.log("Update Shadow error: ", error)
  }
  finally {
    console.log("Update Shadow: finally method")
  }
}
Can anyone suggest why I may be getting these errors? Any help is much appreciated!
I am trying to write a data transformation for an Aurora Postgres data stream. The identity transformation gives me objects like this:
{
  "type": "DatabaseActivityMonitoringRecords",
  "version": "1.0",
  "databaseActivityEvents": "AYADeLZrKReAa/7tgBmd/d06ybQAXwA...BABVoaWV1Fi8LyA==",
  "key": "AADDDVAS ...pztPgaw=="
}
So it appears to me that I need to decrypt these. I have the arn of the key attached to the database in question but I cannot get the data transformation to work. Here is what I have so far:
console.log('Loading function');
const zlib = require('zlib');
const AWS = require('aws-sdk');

exports.handler = async (event, context) => {
  /* Process the list of records and transform them */
  const output = event.records.map((record) => {
    const keyId = "arn:aws:kms:us-east-1:501..89:key/1...d7"; // I don't need this?
    const CiphertextBlob = record.data;
    const kmsClient = new AWS.KMS({region: 'us-east-1'});
    const pt = kmsClient.decrypt({ CiphertextBlob }, (err, data) => {
      if (err) console.log(err, err.stack); // an error occurred
      else {
        const { Plaintext } = data;
        console.log(data);
        return Plaintext;
      }
    });
    return { old_data: record.data };
  });
  console.log(`Processing completed. Successful records ${output.length}.`);
  return { records: output };
};
This gives me the baffling error "errorMessage": "Error: Unable to stringify response body". I am following the example outlined here: https://docs.aws.amazon.com/kms/latest/developerguide/programming-encryption.html#decryption
Any idea what I am doing wrong here?
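For reference, aws-sdk v2 also exposes a promise form of decrypt, which makes it easier to wait for the result inside an async handler. A minimal sketch only, not a verified fix for the error above; it assumes the record data is base64-encoded:
// Sketch: decrypt one record's payload with the promise API (aws-sdk v2).
const AWS = require('aws-sdk');
const kmsClient = new AWS.KMS({ region: 'us-east-1' });

async function decryptRecord(record) {
  const { Plaintext } = await kmsClient
    .decrypt({ CiphertextBlob: Buffer.from(record.data, 'base64') })
    .promise();
  return Plaintext;
}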
I followed the following steps while trying to run an Android app test via AWS Lambda (Node.js):
Created a project
Created an upload
Uploaded the APK to the signed URL
Once the upload was done, I created a device pool using the following params:
var createDevicePoolParams = {
  name: "DAP_Device_Pool",
  description: "DAP_Android_Devices",
  projectArn: projectARN,
  rules: [{
    attribute: "PLATFORM",
    operator: "EQUALS",
    value: "\"ANDROID\""
  }]
};
Then I called scheduleRun with the following params:
var scheduleRunParams = {
  appArn: uploadARN,
  name: "tarunRun",
  devicePoolArn: devicePoolARN,
  projectArn: projectARN,
  test: {
    type: "BUILTIN_FUZZ",
  }
};
But I am getting an error about missing or unprocessed resources.
I am not able to understand what I am missing. My understanding is that if I am using the built-in fuzz test type, then I don't need to upload any custom test cases.
Can somebody please help point out which step is missing?
Then, after your uploads have been processed by Device Farm, call aws devicefarm schedule-run.
[update]
I put this code in an AWS Lambda function and it worked there as well. Here is a gist of it:
https://gist.github.com/jamesknowsbest/3ea0e385988b0098e5f9d38bf5a932b6
Here is the code I just authored, and it seems to work with the built-in Fuzz/Explorer tests:
// assume we already executed `npm install aws-sdk`
var AWS = require('aws-sdk');
// assumes `npm install request`
const request = require("request");
// fs is a built-in Node.js module
const fs = require('fs');
// https://stackoverflow.com/a/41641607/8016330
const sleep = (waitTimeInMs) => new Promise(resolve => setTimeout(resolve, waitTimeInMs));

// Device Farm is only available in the us-west-2 region
var devicefarm = new AWS.DeviceFarm({ region: 'us-west-2' });

(async function() {
  let project_params = {
    name: "test of fuzz tests"
  };
  let PROJECT_ARN = await devicefarm.createProject(project_params).promise().then(
    function(data){
      return data.project.arn;
    },
    function (error) {
      console.error("Error creating project", "Error: ", error);
    }
  );
  console.log("Project created ", "Project arn: ", PROJECT_ARN);

  // create the upload and upload files to the project
  let params = {
    name: "app-debug.apk",
    type: "ANDROID_APP",
    projectArn: PROJECT_ARN
  };
  let UPLOAD = await devicefarm.createUpload(params).promise().then(
    function(data){
      return data.upload;
    },
    function(error){
      console.error("Creating upload failed with error: ", error);
    }
  );
  let UPLOAD_ARN = UPLOAD.arn;
  let UPLOAD_URL = UPLOAD.url;
  console.log("upload created with arn: ", UPLOAD_ARN);
  console.log("uploading file...");

  let options = {
    method: 'PUT',
    url: UPLOAD_URL,
    headers: {},
    body: fs.readFileSync("/path/to/your/apk/file")
  };
  // wait for upload to finish
  await new Promise(function(resolve, reject){
    request(options, function (error, response, body) {
      if (error) {
        console.error("uploading file failed with error: ", error);
        reject(error);
      }
      resolve(body);
    });
  });

  // get the status of the upload and make sure it finished processing before scheduling
  let STATUS = await getStatus(UPLOAD_ARN);
  console.log("upload status is: ", STATUS);
  while (STATUS !== "SUCCEEDED") {
    await sleep(5000);
    STATUS = await getStatus(UPLOAD_ARN);
    console.log("upload status is: ", STATUS);
  }

  // create device pool
  let device_pool_params = {
    projectArn: PROJECT_ARN,
    name: "Google Pixel 2",
    rules: [{"attribute": "ARN","operator":"IN","value":"[\"arn:aws:devicefarm:us-west-2::device:5F20BBED05F74D6288D51236B0FB9895\"]"}]
  }
  let DEVICE_POOL_ARN = await devicefarm.createDevicePool(device_pool_params).promise().then(
    function(data){
      return data.devicePool.arn;
    },function(error){
      console.error("device pool failed to create with error: ", error);
    }
  );
  console.log("Device pool created successfully with arn: ", DEVICE_POOL_ARN);

  // schedule the run
  let schedule_run_params = {
    name: "MyRun",
    devicePoolArn: DEVICE_POOL_ARN, // You can get the Amazon Resource Name (ARN) of the device pool by using the list-pools CLI command.
    projectArn: PROJECT_ARN, // You can get the Amazon Resource Name (ARN) of the project by using the list-projects CLI command.
    test: {
      type: "BUILTIN_FUZZ"
    },
    appArn: UPLOAD_ARN
  };
  let schedule_run_result = await devicefarm.scheduleRun(schedule_run_params).promise().then(
    function(data){
      return data.run;
    },function(error){
      console.error("Schedule run command failed with error: ", error);
    }
  );
  console.log("run finished successfully with result: ", schedule_run_result);
})();

async function getStatus(UPLOAD_ARN){
  return await devicefarm.getUpload({arn: UPLOAD_ARN}).promise().then(
    function(data){
      return data.upload.status;
    },function(error){
      console.error("getting upload failed with error: ", error);
    }
  );
}
Output is:
Project created Project arn: arn:aws:devicefarm:us-west-2:111122223333:project:b9233b49-967e-4b09-a51a-b5c4101340a1
upload created with arn: arn:aws:devicefarm:us-west-2:111122223333:upload:b9233b49-967e-4b09-a51a-b5c4101340a1/48ffd115-f7d7-4df5-ae96-4a44911bff65
uploading file...
upload status is: INITIALIZED
upload status is: SUCCEEDED
Device pool created successfully with arn: arn:aws:devicefarm:us-west-2:111122223333:devicepool:b9233b49-967e-4b09-a51a-b5c4101340a1/c0ce1bbc-7b40-4a0f-a419-ab024a6b1000
run finished successfully with result: { arn:
'arn:aws:devicefarm:us-west-2:111122223333:run:b9233b49-967e-4b09-a51a-b5c4101340a1/39369894-3829-4e14-81c9-bdfa02c7e032',
name: 'MyRun',
type: 'BUILTIN_FUZZ',
platform: 'ANDROID_APP',
created: 2019-06-06T23:51:13.529Z,
status: 'SCHEDULING',
result: 'PENDING',
started: 2019-06-06T23:51:13.529Z,
counters:
{ total: 0,
passed: 0,
failed: 0,
warned: 0,
errored: 0,
stopped: 0,
skipped: 0 },
totalJobs: 1,
completedJobs: 0,
billingMethod: 'METERED',
seed: 982045377,
appUpload:
'arn:aws:devicefarm:us-west-2:111122223333:upload:b9233b49-967e-4b09-a51a-b5c4101340a1/48ffd115-f7d7-4df5-ae96-4a44911bff65',
eventCount: 6000,
jobTimeoutMinutes: 150,
devicePoolArn:
'arn:aws:devicefarm:us-west-2:111122223333:devicepool:b9233b49-967e-4b09-a51a-b5c4101340a1/c0ce1bbc-7b40-4a0f-a419-ab024a6b1000',
radios: { wifi: true, bluetooth: false, nfc: true, gps: true } }
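Note that at this point the run has only been scheduled (status: 'SCHEDULING' in the output above). If you want to block until it finishes, a polling loop analogous to getStatus can be written against getRun; a sketch, reusing the devicefarm client and sleep helper from the code above:
async function waitForRun(RUN_ARN) {
  let run = await devicefarm.getRun({ arn: RUN_ARN }).promise().then(data => data.run);
  // COMPLETED is the terminal status; run.result then holds PASSED, FAILED, ERRORED, etc.
  while (run.status !== "COMPLETED") {
    await sleep(30000);
    run = await devicefarm.getRun({ arn: RUN_ARN }).promise().then(data => data.run);
  }
  return run.result;
}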
HTH
-James
How do I launch a Cloud Dataflow job from a Google Cloud Function? I'd like to use Google Cloud Functions as a mechanism to enable cross-service composition.
I've included a very basic example based on the WordCount sample below. Please note that you'll need to include a copy of the Java binary in your Cloud Function deployment, since it is not in the default environment. Likewise, you'll need to package your deploy JAR with your Cloud Function as well.
module.exports = {
  wordcount: function (context, data) {
    const spawn = require('child_process').spawn;
    const child = spawn(
      'jre1.8.0_73/bin/java',
      ['-cp',
       'MY_JAR.jar',
       'com.google.cloud.dataflow.examples.WordCount',
       '--jobName=fromACloudFunction',
       '--project=MY_PROJECT',
       '--runner=BlockingDataflowPipelineRunner',
       '--stagingLocation=gs://STAGING_LOCATION',
       '--inputFile=gs://dataflow-samples/shakespeare/*',
       '--output=gs://OUTPUT_LOCATION'
      ],
      { cwd: __dirname });
    child.stdout.on('data', function(data) {
      console.log('stdout: ' + data);
    });
    child.stderr.on('data', function(data) {
      console.log('error: ' + data);
    });
    child.on('close', function(code) {
      console.log('closing code: ' + code);
    });
    context.success();
  }
};
You could further enhance this example by using the non-blocking runner and having the function return the Job ID, so that you can poll for job completion separately. This pattern should be valid for other SDKs as well, so long as their dependencies can be packaged into the Cloud Function.
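For instance, with the googleapis client that polling step might look like the sketch below (a sketch only, assuming a recent googleapis version and application-default credentials; projectId and jobId are placeholders returned from the launch step):
// Sketch: poll a Dataflow job's state via the v1b3 REST API.
const { google } = require('googleapis');

async function getJobState(projectId, jobId) {
  const auth = await google.auth.getClient({
    scopes: ['https://www.googleapis.com/auth/cloud-platform']
  });
  const dataflow = google.dataflow({ version: 'v1b3', auth });
  const res = await dataflow.projects.jobs.get({ projectId, jobId });
  return res.data.currentState; // e.g. JOB_STATE_RUNNING, JOB_STATE_DONE
}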
The best way to launch it is via a Cloud Function, but be careful: if you trigger the Cloud Function from Google Cloud Storage, then a Dataflow job will be launched for every file uploaded.
const { google } = require('googleapis');

const templatePath = "gs://template_dir/df_template";
const project = "<project_id>";
const tempLoc = "gs://tempLocation/";
// Scope used when the auth client needs to be scoped explicitly.
const authScope = ['https://www.googleapis.com/auth/cloud-platform'];

exports.PMKafka = (data, context, callback) => {
  const file = data;

  console.log(`Event ${context.eventId}`);
  console.log(`Event Type: ${context.eventType}`);
  console.log(`Bucket Name: ${file.bucket}`);
  console.log(`File Name: ${file.name}`);
  console.log(`Metageneration: ${file.metageneration}`);
  console.log(`Created: ${file.timeCreated}`);
  console.log(`Updated: ${file.updated}`);
  console.log(`Uploaded File Name - gs://${file.bucket}/${file.name}`);

  google.auth.getApplicationDefault(function (err, authClient, projectId) {
    if (err) {
      throw err;
    }
    if (authClient.createScopedRequired && authClient.createScopedRequired()) {
      authClient = authClient.createScoped(authScope);
    }
    const dataflow = google.dataflow({ version: 'v1b3', auth: authClient });

    var inputDict = {
      inputFile: `gs://${file.bucket}/${file.name}`,
      ...
      ...
      <other_runtime_parameters>
    };

    var env = {
      tempLocation: tempLoc
    };

    var resource_opts = {
      parameters: inputDict,
      environment: env,
      // config is assumed to be defined elsewhere in the author's setup
      jobName: config.jobNamePrefix + "-" + new Date().toISOString().toLowerCase().replace(":", "-").replace(".", "-")
    };

    var opts = {
      gcsPath: templatePath,
      projectId: project,
      resource: resource_opts
    };

    console.log(`Dataflow Run Time Options - ${JSON.stringify(opts)}`);

    dataflow.projects.templates.launch(opts, function (err, response) {
      if (err) {
        console.error("problem running dataflow template, error was: ", err);
        // slack is assumed to be the author's own notification helper
        slack.publishMessage(null, null, false, err);
        return;
      }
      console.log("Dataflow template response: ", response);
      var jobid = response["data"]["job"]["id"];
      console.log("Dataflow Job ID: ", jobid);
    });
    callback();
  });
};