Upload the file to Google Cloud Storage.
Dataflow is being pre-processed by reading batch data.
The workload is read from Google Cloud Storage (GCS) to process Dataflow and upload it back to GCS.
But after processing the data, I checked the GCS.
#-*- coding: utf-8 -*-
import apache_beam as beam
import csv
import json
import os
import re
from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json
def preprocessing(fields):
fields = fields.split(",")
header = "label"
for i in range(0, 784):
header += (",pixel" + str(i))
label_list_str = "["
label_list = []
for i in range(0,10) :
if fields[0] == str(i) :
label_list_str+=str(i)
else :
label_list_str+=("0")
if i!=9 :
label_list_str+=","
label_list_str+="],"
for i in range(1,len(fields)) :
label_list_str+=fields[i]
if i!=len(fields)-1:
label_list_str+=","
yield label_list_str
def run(project, bucket, dataset) :
argv = [
"--project={0}".format(project),
"--job_name=kaggle-dataflow",
"--save_main_session",
"--region=asia-northeast1",
"--staging_location=gs://{0}/kaggle-bucket-v1/".format(bucket),
"--temp_location=gs://{0}/kaggle-bucket-v1/".format(bucket),
"--max_num_workers=8",
"--worker_region=asia-northeast3",
"--worker_disk_type=compute.googleapis.com/projects//zones//diskTypes/pd-ssd",
"--autoscaling_algorithm=THROUGHPUT_BASED",
"--runner=DataflowRunner",
"--worker_region=asia-northeast3"
]
result_output = 'gs://kaggle-bucket-v1/result/result.csv'
filename = "gs://{}/train.csv".format(bucket)
pipeline = beam.Pipeline(argv=argv)
ptransform = (pipeline
| "Read from GCS" >> beam.io.ReadFromText(filename)
| "Kaggle data pre-processing" >> beam.FlatMap(preprocessing)
)
(ptransform
| "events:out" >> beam.io.WriteToText(
result_output
)
)
pipeline.run()
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Run pipeline on the cloud")
parser.add_argument("--project", dest="project", help="Unique project ID", required=True)
parser.add_argument("--bucket", dest="bucket", help="Bucket where your data were ingested", required=True)
parser.add_argument("--dataset", dest="dataset", help="BigQuery dataset")
args = vars(parser.parse_args())
print("Correcting timestamps and writing to BigQuery dataset {}".format(args["dataset"]))
run(project=args["project"], bucket=args["bucket"], dataset=args["dataset"])
I am trying to do now is to upload a file containing a header to GCS and add data by working with the name of the file. However, it doesn't work well.
How can I include the header in the CSV file?
Related
At the following page
https://googlecloudplatform.github.io/google-cloud-python/latest/storage/blobs.html
there are all the API calls which can be used for Python & Google Cloud storage. Even in the "official" samples on github
https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/storage/cloud-client/snippets.py
don't have a related example.
Finally, downloading a directory with the same method used for download files gives the error
Error: [Errno 21] Is a directory:
You just have to first list all the files in a directory and then download them one by one:
bucket_name = 'your-bucket-name'
prefix = 'your-bucket-directory/'
dl_dir = 'your-local-directory/'
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name=bucket_name)
blobs = bucket.list_blobs(prefix=prefix) # Get list of files
for blob in blobs:
filename = blob.name.replace('/', '_')
blob.download_to_filename(dl_dir + filename) # Download
blob.name includes the entire directory structure + filename, so if you want the same file name as in the bucket, you might want to extract it first (instead of replacing / with _)
If you want to keep the same directory structure without renaming and also create nested folders. I have for python 3.5+ a solution based on #ksbg answer :
from pathlib import Path
bucket_name = 'your-bucket-name'
prefix = 'your-bucket-directory/'
dl_dir = 'your-local-directory/'
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name=bucket_name)
blobs = bucket.list_blobs(prefix=prefix) # Get list of files
for blob in blobs:
if blob.name.endswith("/"):
continue
file_split = blob.name.split("/")
directory = "/".join(file_split[0:-1])
Path(directory).mkdir(parents=True, exist_ok=True)
blob.download_to_filename(blob.name)
Lets say, we want to download FINALFOLDER from the storage path: gs://TEST_BUCKET_NAME/FOLDER1/FOLDER2/FINALFOLDER
After downloading, the final path will look like: D:\\my_blob_data\FINALFOLDER
from os import makedirs
from os.path import join, isdir, isfile, basename
from google.cloud import storage
# if your environment was authenticated, the default config will be picked up
storage_client = storage.Client() # comment this line if you want to use service account
# uncomment the line below if you have a service account json
# storage_client = storage.Client.from_service_account_json('creds/sa.json')
bucket_name = 'TEST_BUCKET_NAME'
prefix = 'FOLDER2'
dst_path = 'D:\\my_blob_data'
if isdir(dstPath) == False:
makedirs(dstPath)
bucket = storage_client.bucket(bucket_name=bucket_name)
blobs = bucket.list_blobs(prefix=prefix) # Get list of files
for blob in blobs:
blob_name = blob.name
dst_file_name = blob_name.replace('FOLDER1/FOLDER2', dst_path) #.replace('FOLDER1/FOLDER2', 'D:\\my_blob_data')
# extract the final directory and create it in the destination path if it does not exist
dst_dir = dst_file_name.replace('/' + basename(dst_file_name), '')
if isdir(dst_dir) == False:
makedirs(dst_dir)
# download the blob object
blob.download_to_filename(dst_file_name)
Using tensoflow gfile package, here is a recursive function.
root_dir is the GCS parent folder.
local_base_dir is the parent folder created at local
def copy_recursively(root_dir, local_base_dir):
if tf.io.gfile.exists(local_base_dir):
tf.io.gfile.rmtree(local_base_dir)
tf.io.gfile.mkdir(local_base_dir)
file_list = tf.io.gfile.glob(root_dir+'/**')
for item in file_list:
if not tf.io.gfile.isdir(item):
fname = item.rsplit('/',1)[-1]
if not fname.startswith('.'):
tf.io.gfile.copy(item,
os.path.join(local_base_dir,fname),
overwrite=False)
else:
child_dir= item.rsplit('/',1)[-1]
full_dir_path = os.path.join(local_base_dir,child_dir)
print(f"Setting up child directory: {full_dir_path}")
copy_recursively(item,full_dir_path)
root_dir = 'gs://.../.../..'
local_base_dir = root_dir.rsplit('/',1)[-1]
copy_recursively(root_dir, local_base_dir)
Refer This Link- https://medium.com/#sandeepsinh/multiple-file-download-form-google-cloud-storage-using-python-and-gcs-api-1dbcab23c44
1 - Add Your Credential Json
2 - List Bucket Items
3 - Download
import logging
import os
from google.cloud import storage
global table_id
global bucket_name
logging.basicConfig(format=’%(levelname)s:%(message)s’, level=logging.DEBUG)
bucket_name = ‘mybucket’
table_id = ‘shakespeare’
storage_client = storage.Client.from_service_account_json(‘/google-cloud/keyfile/service_account.json’)
# The “folder” where the files you want to download are
folder=’/google-cloud/download/{}’.format(table_id)
delimiter=’/’
bucket=storage_client.get_bucket(bucket_name)
blobs=bucket.list_blobs(prefix=table_id, delimiter=delimiter) #List all objects that satisfy the filter.
# Download the file to a destination
def download_to_local():
logging.info(‘File download Started…. Wait for the job to complete.’)
# Create this folder locally if not exists
if not os.path.exists(folder):
os.makedirs(folder)
# Iterating through for loop one by one using API call
for blob in blobs:
logging.info(‘Blobs: {}’.format(blob.name))
destination_uri = ‘{}/{}’.format(folder, blob.name)
blob.download_to_filename(destination_uri)
logging.info(‘Exported {} to {}’.format(
blob.name, destination_uri))
if __name__ == ‘__main__’:
download_to_local()
Dataflow is being pre-processed by reading batch data.
The workload is read from Google Cloud Storage (GCS) to process Dataflow and upload it back to GCS.
But after processing the data, I checked the GCS.
result-001.csv
result-002.csv
result-003.csv
This is how the data is divided and stored.
Can't I combine these files into one?
#-*- coding: utf-8 -*-
import apache_beam as beam
import csv
import json
import os
import re
from apache_beam.io.gcp.bigquery_tools import parse_table_schema_from_json
def preprocessing(fields):
fields = fields.split(",")
header = "label"
for i in range(0, 784):
header += (",pixel" + str(i))
label_list_str = "["
label_list = []
for i in range(0,10) :
if fields[0] == str(i) :
label_list_str+=str(i)
else :
label_list_str+=("0")
if i!=9 :
label_list_str+=","
label_list_str+="],"
for i in range(1,len(fields)) :
label_list_str+=fields[i]
if i!=len(fields)-1:
label_list_str+=","
yield label_list_str
def run(project, bucket, dataset) :
argv = [
"--project={0}".format(project),
"--job_name=kaggle-dataflow",
"--save_main_session",
"--region=asia-northeast1",
"--staging_location=gs://{0}/kaggle-bucket-v1/".format(bucket),
"--temp_location=gs://{0}/kaggle-bucket-v1/".format(bucket),
"--max_num_workers=8",
"--worker_region=asia-northeast3",
"--worker_disk_type=compute.googleapis.com/projects//zones//diskTypes/pd-ssd",
"--autoscaling_algorithm=THROUGHPUT_BASED",
"--runner=DataflowRunner",
"--worker_region=asia-northeast3"
]
result_output = 'gs://kaggle-bucket-v1/result/result.csv'
filename = "gs://{}/train.csv".format(bucket)
pipeline = beam.Pipeline(argv=argv)
ptransform = (pipeline
| "Read from GCS" >> beam.io.ReadFromText(filename)
| "Kaggle data pre-processing" >> beam.FlatMap(preprocessing)
)
(ptransform
| "events:out" >> beam.io.WriteToText(
result_output
)
)
pipeline.run()
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Run pipeline on the cloud")
parser.add_argument("--project", dest="project", help="Unique project ID", required=True)
parser.add_argument("--bucket", dest="bucket", help="Bucket where your data were ingested", required=True)
parser.add_argument("--dataset", dest="dataset", help="BigQuery dataset")
args = vars(parser.parse_args())
print("Correcting timestamps and writing to BigQuery dataset {}".format(args["dataset"]))
run(project=args["project"], bucket=args["bucket"], dataset=args["dataset"])
Thank you for reading :)
Method beam.io.WriteToText automatically splits files when writing for best performance. You can explicitly add parameter num_shards = 1 if you want 1 file only.
num_shards (int) – The number of files (shards) used for output. If
not set, the service will decide on the optimal number of shards.
Constraining the number of shards is likely to reduce the performance
of a pipeline. Setting this value is not recommended unless you
require a specific number of output files.
Your writing to text should look like this:
(ptransform
| "events:out" >> beam.io.WriteToText(
result_output,num_shards=1
)
)
I am very new to programming. I am working on a pipeline to analyze DMARC report files that are sent to my email account, that I am manually placing in an s3 bucket. The goal of this task is to download, extract, and analyze files using parsedmarc: https://github.com/domainaware/parsedmarc The part I'm having difficulty with is setting a conditional statement to extract .gz files if the target file is not a .zip file. I'm assuming the gzip library will be sufficient for this purpose. Here is the code I have so far. I'm using python3 and the boto3 library for AWS. Any help is appreciated!
import parsedmarc
import pprint
import json
import boto3
import zipfile
import gzip
pp = pprint.PrettyPrinter(indent=2)
def main():
#Set default session profile and region for sandbox account. Access keys are pulled from /.aws/config and /.aws/credentials.
#The 'profile_name' value comes from the header for the account in question in /.aws/config and /.aws/credentials
boto3.setup_default_session(region_name="aws-region-goes-here")
boto3.setup_default_session(profile_name="aws-account-profile-name-goes-here")
#Define the s3 resource, the bucket name, and the file to download. It's hardcoded for now...
s3_resource = boto3.resource(s3)
s3_resource.Bucket('dmarc-parsing').download_file('source-dmarc-report-filename.zip' '/home/user/dmarc/parseme.zip')
#Use the zipfile python library to extract the file into its raw state.
with zipfile.ZipFile('/home/user/dmarc/parseme.zip', 'r') as zip_ref:
zip_ref.extractall('/home/user/dmarc')
#Ingest all locations for xml file source
dmarc_report_directory = '/home/user/dmarc/'
dmarc_report_file = 'parseme.xml'
"""I need an if statement here for extracting .gz files if the file type is not .zip. The contents of every archive are .xml files"""
#Set report output variables using functions in parsedmarc. Variable set to equal the output
pd_report_output=parsedmarc.parse_aggregate_report_file(_input=f"{dmarc_report_directory}{dmarc_report_file}")
#use jsonify to make the output in json format
pd_report_jsonified = json.loads(json.dumps(pd_report_output))
dkim_status = pd_report_jsonified['records'][0]['policy_evaluated']['dkim']
spf_status = pd_report_jsonified['records'][0]['policy_evaluated']['spf']
if dkim_status == 'fail' or spf_status == 'fail':
print(f"{dmarc_report_file} reports failure. oh crap. report:")
else:
print(f"{dmarc_report_file} passes. great. report:")
pp.pprint(pd_report_jsonified['records'][0]['auth_results'])
if __name__ == "__main__":
main()
Here is the code using the parsedmarc.parse_aggregate_report_xml method I found. Hope this helps others in parsing these reports:
import parsedmarc
import pprint
import json
import boto3
import zipfile
import gzip
pp = pprint.PrettyPrinter(indent=2)
def main():
#Set default session profile and region for account. Access keys are pulled from ~/.aws/config and ~/.aws/credentials.
#The 'profile_name' value comes from the header for the account in question in ~/.aws/config and ~/.aws/credentials
boto3.setup_default_session(profile_name="aws_profile_name_goes_here", region_name="region_goes_here")
source_file = 'filename_in_s3_bucket.zip'
destination_directory = '/tmp/'
destination_file = 'compressed_report_file'
#Define the s3 resource, the bucket name, and the file to download. It's hardcoded for now...
s3_resource = boto3.resource('s3')
s3_resource.Bucket('bucket-name-for-dmarc-report-files').download_file(source_file, f"{destination_directory}{destination_file}")
#Extract xml
outputxml = parsedmarc.extract_xml(f"{destination_directory}{destination_file}")
#run parse dmarc analysis & convert output to json
pd_report_output = parsedmarc.parse_aggregate_report_xml(outputxml)
pd_report_jsonified = json.loads(json.dumps(pd_report_output))
#loop through results and find relevant status info and pass fail status
dmarc_report_status = ''
for record in pd_report_jsonified['records']:
if False in record['alignment'].values():
dmarc_report_status = 'Failed'
#************ add logic for interpreting results
#if fail, publish to sns
if dmarc_report_status == 'Failed':
message = "Your dmarc report failed a least one check. Review the log for details"
sns_resource = boto3.resource('sns')
sns_topic = sns_resource.Topic('arn:aws:sns:us-west-2:112896196555:TestDMARC')
sns_publish_response = sns_topic.publish(Message=message)
if __name__ == "__main__":
main()
I am using Google Cloud Speech-to-Text AP and trying to transcribe long audio file.However the audio file from the bucket cannot be detected.
I get an error stating :IOError: [Errno 2] No such file or directory:
def transcribe_gcs(gcs_uri):
time(gcs_uri)
"""Asynchronously transcribes the audio file specified by the gcs_uri."""
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
client = speech.SpeechClient()
audio = types.RecognitionAudio(uri=gcs_uri)
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
sample_rate_hertz=16000,
language_code='en-US')
operation = client.long_running_recognize(config, audio)
print('Waiting for operation to complete...')
response = operation.result(timeout=90)
# Each result is for a consecutive portion of the audio. Iterate through
# them to get the transcripts for the entire audio file.
for result in response.results:
# The first alternative is the most likely one for this portion.
print(u'Transcript: {}'.format(result.alternatives[0].transcript))
print('Confidence: {}'.format(result.alternatives[0].confidence))
Try this
import requests
import json
url = "https://speech.googleapis.com/v1/speech:longrunningrecognize?key=<apiaccesskey>"
payload = {"config": {"encoding": "LINEAR16","sample_rate_hertz": 8000,
"language_code": "en-IN"},
"audio": {"uri": "gs://bucketname/file.flac"}}
r = requests.post(url, data=json.dumps(payload))
json_resp = r.json()
token_resp=json_resp['name']
url = "https://speech.googleapis.com/v1/operations/" + str(token_resp) +
"?key=<apiacesskey>"
content_response = requests.get(url)
content_json = content_response.json()
Your response is in content_json variable.
I am trying to automate the running of and downloading nessus scans using python. I have been using the nessrest api for python, and am able to successfully run a scan, but am not being successfully download the report in nessus format.
Any ideas how I can do this? I have been using the module scan_download, but that actually executes before my scan even finishes.
Thanks for the help in advance!
Just looking back at this question, heres an example of using Nessrest API to pull down CSV report exports from you nessus host,
#!/usr/bin/python2.7
import sys
import os
import io
from nessrest import ness6rest
file_format = 'csv' # options: nessus, csv, db, html
dbpasswd = ''
scan = ness6rest.Scanner(url="https://nessus:8834", login="admin", password="P#ssword123", insecure=True)
scan.action(action='scans', method='get')
folders = scan.res['folders']
scans = scan.res['scans']
if scan:
scan.action(action='scans', method='get')
folders = scan.res['folders']
scans = scan.res['scans']
for f in folders:
if not os.path.exists(f['name']):
if not f['type'] == 'trash':
os.mkdir(f['name'])
for s in scans:
scan.scan_name = s['name']
scan.scan_id = s['id']
folder_name = next(f['name'] for f in folders if f['id'] == s['folder_id'])
folder_type = next(f['type'] for f in folders if f['id'] == s['folder_id'])
# skip trash items
if folder_type == 'trash':
continue
if s['status'] == 'completed':
file_name = '%s_%s.%s' % (scan.scan_name, scan.scan_id, file_format)
file_name = file_name.replace('\\','_')
file_name = file_name.replace('/','_')
file_name = file_name.strip()
relative_path_name = folder_name + '/' + file_name
# PDF not yet supported
# python API wrapper nessrest returns the PDF as a string object instead of a byte object, making writing and correctly encoding the file a chore...
# other formats can be written out in text mode
file_modes = 'wb'
# DB is binary mode
#if args.format == "db":
# file_modes = 'wb'
with io.open(relative_path_name, file_modes) as fp:
if file_format != "db":
fp.write(scan.download_scan(export_format=file_format))
else:
fp.write(scan.download_scan(export_format=file_format, dbpasswd=dbpasswd))
can see more examples here,
https://github.com/tenable/nessrest/tree/master/scripts