I upload files to Google Cloud Storage using "@ffmpeg-installer/ffmpeg" and "@google-cloud/storage" in my Node.js app.
Step 1: the file is written to the filesystem in child processes, one process per resolution (six in total).
Step 2: encryption (converting to a stream).
Step 3: upload to Google Cloud Storage.
I use the "Upload a directory to a bucket" approach to send the video from the client to the Google Cloud Storage bucket.
This works fine only with small videos.
For example, when I upload a video with a duration of one hour, it is split into chunks and I end up with more than three thousand files in total. The problem occurs when there are more than 1500 files.
So I actually upload a folder with a large number of files, but not all of them make it to the bucket.
Maybe someone has run into a similar problem and can help me fix it.
// Assumed imports so the snippet runs on its own: promise-based fs, nanoid for
// folder names, and a Storage client from @google-cloud/storage. `log` is the
// application's logger.
const path = require('path');
const fs = require('fs').promises;
const { nanoid } = require('nanoid');
const { Storage } = require('@google-cloud/storage');

const storage = new Storage();

const uploadFolder = async (bucketName, directoryPath, socketInstance) => {
  try {
    let dirCtr = 1;
    let itemCtr = 0;
    const fileList = [];

    // Uploads every collected file in parallel under a random folder name.
    const onComplete = async () => {
      const folderName = nanoid(46);
      await Promise.all(
        fileList.map(filePath => {
          const fileName = path.relative(directoryPath, filePath);
          const destination = `${folderName}/${fileName}`;
          return storage
            .bucket(bucketName)
            .upload(filePath, { destination })
            .then(
              uploadResp => ({ fileName: destination, status: uploadResp[0] }),
              err => ({ fileName: destination, response: err })
            );
        })
      );
      if (socketInstance) {
        socketInstance.emit('uploadProgress', {
          message: `Added files to Google bucket`,
          last: false,
          part: false
        });
      }
      return folderName;
    };

    // Recursively collects every file path under the directory into fileList.
    const getFiles = async directory => {
      const items = await fs.readdir(directory);
      dirCtr--;
      itemCtr += items.length;
      for (const item of items) {
        const fullPath = path.join(directory, item);
        const stat = await fs.stat(fullPath);
        itemCtr--;
        if (stat.isFile()) {
          fileList.push(fullPath);
        } else if (stat.isDirectory()) {
          dirCtr++;
          await getFiles(fullPath);
        }
      }
    };

    await getFiles(directoryPath);
    return onComplete();
  } catch (e) {
    log.error(e.message);
    throw new Error('Can\'t store folder.');
  }
};
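A likely factor is that the Promise.all in onComplete starts an upload for every file at once, so with 3000+ chunks the process can run out of sockets or file descriptors and some uploads fail quietly. Below is a minimal sketch of capping concurrency by uploading in fixed-size batches; it reuses the storage client, path, and fileList from above, and the batch size of 50 is an arbitrary assumption to tune.

// Sketch only: replace the single Promise.all with batched uploads.
const uploadInBatches = async (bucketName, directoryPath, fileList, folderName, batchSize = 50) => {
  const results = [];
  for (let i = 0; i < fileList.length; i += batchSize) {
    // Wait for each batch to finish before starting the next one.
    const batch = fileList.slice(i, i + batchSize).map(filePath => {
      const destination = `${folderName}/${path.relative(directoryPath, filePath)}`;
      return storage
        .bucket(bucketName)
        .upload(filePath, { destination })
        .then(
          resp => ({ fileName: destination, status: resp[0] }),
          err => ({ fileName: destination, response: err })
        );
    });
    results.push(...await Promise.all(batch));
  }
  return results;
};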
I have a few lambda functions that allow me to make a multipart upload to an Amazon S3 bucket. One is responsible for creating the multipart upload, another uploads each part, and the last one completes the upload.
The first two seem to work fine (they respond with statusCode 200), but the last one fails. On CloudWatch, I can see an error saying 'Your proposed upload is smaller than the minimum allowed size'.
This shouldn't be true, since I'm uploading files bigger than the 5 MB minimum size specified in the docs. However, I think the issue is happening on every single part upload.
Why? Because each part only has 2 MB of data. The docs say that every part but the last needs to be at least 5 MB. However, when I try to upload parts bigger than 2 MB, I get a CORS error, most probably because I have exceeded the 6 MB Lambda payload limit.
Can anyone help me with this? Below is my client-side code, in case you can spot an error in it.
setLoading(true);
const file = files[0];
const size = 2000000;
const extension = file.name.substring(file.name.lastIndexOf('.'));
try {
  const multiStartResponse = await startMultiPartUpload({ fileType: extension });
  console.log(multiStartResponse);
  let part = 1;
  let parts = [];
  /* eslint-disable no-await-in-loop */
  for (let start = 0; start < file.size; start += size) {
    const chunk = file.slice(start, start + size + 1);
    const textChunk = await chunk.text();
    const partResponse = await uploadPart({
      file: textChunk,
      fileKey: multiStartResponse.data.Key,
      partNumber: part,
      uploadId: multiStartResponse.data.UploadId,
    });
    console.log(partResponse);
    parts.push({ ETag: partResponse.data.ETag, PartNumber: part });
    part++;
  }
  /* eslint-enable no-await-in-loop */
  const completeResponse = await completeMultiPartUpload({
    fileKey: multiStartResponse.data.Key,
    uploadId: multiStartResponse.data.UploadId,
    parts,
  });
  console.log(completeResponse);
} catch (e) {
  console.log(e);
} finally {
  setLoading(false);
}
It seems that uploading parts via Lambda is simply not possible, so we need a different approach.
Now, our startMultiPartUpload lambda returns not only an upload ID but also a set of signed URLs, generated with the aws-sdk S3 class using the getSignedUrlPromise method and 'uploadPart' as the operation, as shown below:
const getSignedPartURL = (bucket, fileKey, uploadId, partNumber) =>
  s3.getSignedUrlPromise('uploadPart', {
    Bucket: bucket,
    Key: fileKey,
    UploadId: uploadId,
    PartNumber: partNumber,
  });
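For context, here is a minimal sketch of what such a startMultiPartUpload handler could look like (aws-sdk v2; the bucket name, key generation, response shape, and expiry are assumptions, not our exact lambda):

// Sketch only. Bucket name, key naming scheme, and Expires are assumptions.
const AWS = require('aws-sdk');
const s3 = new AWS.S3();
const BUCKET = 'my-upload-bucket'; // hypothetical

exports.handler = async (event) => {
  const { fileType, chunksQuan } = JSON.parse(event.body);
  const key = `${Date.now()}${fileType}`;

  // 1. Create the multipart upload.
  const startUploadResponse = await s3
    .createMultipartUpload({ Bucket: BUCKET, Key: key })
    .promise();

  // 2. Pre-sign one 'uploadPart' URL per expected part (part numbers start at 1).
  const signedURLs = await Promise.all(
    Array.from({ length: chunksQuan }, (_, i) =>
      s3.getSignedUrlPromise('uploadPart', {
        Bucket: BUCKET,
        Key: key,
        UploadId: startUploadResponse.UploadId,
        PartNumber: i + 1,
        Expires: 3600, // seconds; long-running uploads may need more
      })
    )
  );

  return {
    statusCode: 200,
    body: JSON.stringify({ signedURLs, startUploadResponse }),
  };
};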
Also, since uploading a part this way does not return an ETag (or maybe it does, but I just couldn't get it), we need to call the listParts method on the S3 class after uploading each part in order to get those ETags. I'll leave my React code below:
const uploadPart = async (url, data) => {
  try {
    // return await uploadPartToS3(url, data);
    return fetch(url, {
      method: 'PUT',
      body: data,
    }).then((e) => e.body);
  } catch (e) {
    console.error(e);
    throw new Error('Unknown error');
  }
};
// If file is bigger than 50Mb then perform a multi part upload
const uploadMultiPart = async ({ name, size, originFileObj }, updateUploadingMedia) => {
  // chunk size determines each part size. This needs to be > 5Mb
  const chunkSize = 60000000;
  let chunkStart = 0;
  const extension = name.substring(name.lastIndexOf('.'));
  const partsQuan = Math.ceil(size / chunkSize);
  // Start multi part upload. This returns both uploadId and signed urls for each part.
  const startResponse = await startMultiPartUpload({
    fileType: extension,
    chunksQuan: partsQuan,
  });
  console.log('start response: ', startResponse);
  const {
    signedURLs,
    startUploadResponse: { Key, UploadId },
  } = startResponse.data;
  try {
    let promises = [];
    /* eslint-disable no-await-in-loop */
    for (let i = 0; i < partsQuan; i++) {
      // Split file into parts and upload each one to its signed url
      const chunk = await originFileObj.slice(chunkStart, chunkStart + chunkSize).arrayBuffer();
      chunkStart += chunkSize;
      promises.push(uploadPart(signedURLs[i], chunk));
      // Throttle: wait for every batch of 5 parts before queueing more
      if (promises.length === 5) {
        await Promise.all(promises);
        promises = [];
      }
    }
    /* eslint-enable no-await-in-loop */
    // wait until every part is uploaded
    await allProgress({ promises, name }, (media) => {
      updateUploadingMedia(media);
    });
    // Get parts list to build complete request (each upload does not retrieve ETag)
    const partsList = await listParts({
      fileKey: Key,
      uploadId: UploadId,
    });
    // build parts object for complete upload
    const completeParts = partsList.data.Parts.map(({ PartNumber, ETag }) => ({
      ETag,
      PartNumber,
    }));
    // Complete multi part upload
    completeMultiPartUpload({
      fileKey: Key,
      uploadId: UploadId,
      parts: completeParts,
    });
    return Key;
  } catch (e) {
    console.error('ERROR', e);
    const abortResponse = await abortUpload({
      fileKey: Key,
      uploadId: UploadId,
    });
    console.error(abortResponse);
  }
};
Sorry for the indentation; I corrected it line by line as best as I could :).
Some considerations:
- We use 60 MB chunks because our backend took too long generating all those signed URLs for big files.
- Also, this solution is meant to upload really big files, which is why we await every 5 parts.
However, we are still facing issues uploading huge files (about 35 GB): after uploading 100/120 parts, the fetch requests suddenly start to fail and no more parts are uploaded. If someone knows what's going on, that would be amazing. I publish this as an answer because I think most people will find it very useful.
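As a side note, a part uploaded through a presigned URL does return its ETag in the PUT response's ETag header, so the listParts round trip can in principle be skipped. The catch is that the browser can only read that header if the bucket's CORS configuration exposes it. Here is a sketch of that variant; the CORS snippet and error handling are assumptions to check against your own bucket setup:

// Bucket CORS configuration must expose the ETag header to the browser, e.g.:
// { "AllowedMethods": ["PUT"], "AllowedOrigins": ["*"], "ExposeHeaders": ["ETag"] }

const uploadPartWithETag = async (url, data, partNumber) => {
  const response = await fetch(url, { method: 'PUT', body: data });
  if (!response.ok) throw new Error(`Part ${partNumber} failed: ${response.status}`);
  // Readable only when the bucket CORS config exposes ETag
  const etag = response.headers.get('ETag');
  return { ETag: etag, PartNumber: partNumber };
};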
I'm trying to create a REST API with ExpressJS that accepts an image and passes it on (with a POST request) to another service, which is in charge of performing some operations (resize, etc.) and storing it in AWS S3. I know the same thing could easily be done with a Lambda function directly, but I have a K8s cluster and I want to make it worth it.
All components are already working, with the exception of the service that forwards the image to the second service.
The idea I found on the internet is to use a stream, but I get the exception Error: Expected a stream at Object.getStream [as default]
How can I solve that? Is this the right approach, or is there a better solution to achieve the same result?
const headers = req.headers;
const files: any = req.files
const filename = files[0].originalname;
const buffer = await getStream(files[0].stream)
const formFile = new FormData();
formFile.append('image', buffer, filename);
headers['Content-Type'] = 'multipart/form-data';
axios.post("http://localhost:1401/content/image/test/upload/", formFile, {
headers: headers,
})
.catch((error) => {
const { status, data } = error.response;
res.status(status).send(data);
})
I found a solution, which I'm posting here for anyone who runs into the same problem.
Install form-data in your Node project:
yarn add form-data
Then in your controller:
const headers = req.headers;
const files: any = req.files;
const formFile = new FormData();
files.forEach((file: any) => {
  const filename = file.originalname;
  const buffer = file.buffer;
  formFile.append('image', buffer, filename);
});
// set the correct header otherwise it won't work
headers["content-type"] = `multipart/form-data; boundary=${formFile.getBoundary()}`;
// now you can send the image to the second service
axios.post("http://localhost:1401/content/image/test/upload/", formFile, {
    headers: headers,
  })
  .then((r: any) => {
    res.sendStatus(r.status).end();
  })
  .catch((error) => {
    const { status, data } = error.response;
    res.status(status).send(data);
  });
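Note that this relies on file.buffer being populated, which is the behaviour of multer's in-memory storage; since the multer setup isn't shown above, the following is only a sketch of that assumption (the route path is hypothetical):

import express from 'express';
import multer from 'multer';

const app = express();

// memoryStorage keeps each upload in memory and exposes it as file.buffer
const upload = multer({ storage: multer.memoryStorage() });

// hypothetical route; the controller body shown above goes inside the handler
app.post('/image/forward', upload.any(), (req, res) => {
  // ...forward req.files to the second service as shown above
});

app.listen(3000);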
My function is triggered by a Cloud Storage event and loads files into a BigQuery table. My issue is that we received some .zip files with the same name, and the function attempts to load those files as well, which causes problems with the table. I need the code to only process .csv files. Below is the code I have so far:
exports.ToBigQuery = (event, callback) => {
  const file = event.data;
  const context = event.context;

  const BigQuery = require('@google-cloud/bigquery');
  const Storage = require('@google-cloud/storage');

  const projectId = "gas-ddr";
  const datasetId = "gas_ddr";
  const bucketName = file.bucket;
  const filename = file.name;
  const dashOffset = filename.indexOf('-');
  const tableId = filename.substring(0, dashOffset);

  console.log(`Load ${filename} into ${tableId}.`);

  // Instantiates clients
  const bigquery = new BigQuery({
    projectId: projectId,
  });

  const storage = Storage({
    projectId: projectId,
  });

  const metadata = {
    allowJaggedRows: true,
    skipLeadingRows: 1
  };

  let job;

  // Loads data from a Google Cloud Storage file into the table
  bigquery
    .dataset(datasetId)
    .table(tableId)
    .load(storage.bucket(bucketName).file(filename), metadata)
    .then(results => {
      job = results[0];
      console.log(`Job ${job.id} started.`);
      // Wait for the job to finish
      return job;
    })
    .then(metadata => {
      // Check the job's status for errors
      const errors = metadata.status.errors;
      if (errors && errors.length > 0) {
        throw errors;
      }
    })
    .then(() => {
      console.log(`Job ${job.id} completed.`);
    })
    .catch(err => {
      console.error('ERROR:', err);
    });

  callback();
};
This is simply a JavaScript question. You can extract the extension from the filename and process files accordingly:
function getExtension(filename) {
  var parts = filename.split('.');
  return parts[parts.length - 1];
}

if (getExtension(filename) === "csv") {
  // Loads data from a Google Cloud Storage file into the table
  bigquery
    .dataset(datasetId)
    ...
}
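Equivalently, Node's built-in path module can do the check, and it behaves more predictably for filenames without any dot; a small sketch (the skip log message is just an illustration):

const path = require('path');

// path.extname returns '.csv' for "report.csv" and '' when there is no extension
if (path.extname(filename).toLowerCase() === '.csv') {
  // run the BigQuery load as in the original function
} else {
  console.log(`Skipping non-CSV file: ${filename}`);
  callback();
}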
I have a Cloud Function that takes a .csv file landing in Cloud Storage and loads it into a BigQuery table. The issue is that it appends to the table, and I need it to overwrite instead. I found a way to do this from the command line using --replace, but I'm not sure how to express it in the JSON job configuration used by a Cloud Function. Below is my current code:
exports.ToBigQuery_Stage = (event, callback) => {
  const file = event.data;
  const context = event.context;

  const BigQuery = require('@google-cloud/bigquery');
  const Storage = require('@google-cloud/storage');

  const projectId = "gas-ddr";
  const datasetId = "gas_ddr_qc_stage";
  const bucketName = file.bucket;
  const filename = file.name;

  // Do not use the ftp_files bucket to ensure that the bucket does not get crowded.
  // Change bucket to gas_ddr_files_staging.
  // Set the table name (tableId) to the full file name including date;
  // this gives each table a new distinct name and keeps a record of all of the files received.
  // This may not be the best way to do this... at some point we will need to archive and delete prior records.
  const dashOffset = filename.indexOf('-');
  const tableId = filename.substring(0, dashOffset) + "_STAGE";

  console.log(`Load ${filename} into ${tableId}.`);

  // Instantiates clients
  const bigquery = new BigQuery({
    projectId: projectId,
  });

  const storage = Storage({
    projectId: projectId,
  });

  const metadata = {
    allowJaggedRows: true,
    skipLeadingRows: 1
  };

  let job;

  // Loads data from a Google Cloud Storage file into the table
  bigquery
    .dataset(datasetId)
    .table(tableId)
    .load(storage.bucket(bucketName).file(filename), metadata)
    .then(results => {
      job = results[0];
      console.log(`Job ${job.id} started.`);
      // Wait for the job to finish
      return job;
    })
    .then(metadata => {
      // Check the job's status for errors
      const errors = metadata.status.errors;
      if (errors && errors.length > 0) {
        throw errors;
      }
    })
    .then(() => {
      console.log(`Job ${job.id} completed.`);
    })
    .catch(err => {
      console.error('ERROR:', err);
    });

  callback();
};
You can add this to the metadata:

const metadata = {
  allowJaggedRows: true,
  skipLeadingRows: 1,
  writeDisposition: 'WRITE_TRUNCATE'
};
You can find more in the documentation.
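For reference, writeDisposition accepts three values in a BigQuery load job configuration, and WRITE_APPEND is the default, which is why the table was being appended to:

const metadata = {
  allowJaggedRows: true,
  skipLeadingRows: 1,
  // WRITE_TRUNCATE - overwrite the table with the loaded data
  // WRITE_APPEND   - append the loaded data to the table (the default)
  // WRITE_EMPTY    - only write if the table is empty, otherwise the job fails
  writeDisposition: 'WRITE_TRUNCATE'
};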