I have downloaded an archive of mails from my gmail account. I am using the following python(2.7) code taken from a blog to convert the contents of the archive to csv.
import mailbox
import csv
writer = csv.writer(open(("clean_mail.csv", "wb"))
for message in mailbox.mbox('archive.mbox'):
writer.writerow([message['subject'], message['from'], message['date']])
I want to include the body of the mail(the actual messages) too...but couldn't figure out how. I have not used python earlier, can someone help please? I have used other SO options given but couldn't get through.
To do the same task, I have used the following code too: but get indentation error for line 60: return json_msg. I have tried different indentation options but nothing improved.
import sys
import mailbox
import email
import quopri
import json
import time
from BeautifulSoup import BeautifulSoup
from dateutil.parser import parse
MBOX = 'Users/mymachine/client1/Takeout/Mail/archive.mbox'
OUT_FILE = 'Users/mymachine/client1/Takeout/Mail/archive.mbox.json'
def cleanContent(msg):
msg = quopri.decodestring(msg)
try:
soup = BeautifulSoup(msg)
except:
return ''
return ''.join(soup.findAll(text=True))
# There's a lot of data to process, and the Pythonic way to do it is with a
# generator. See http://wiki.python.org/moin/Generators.
# Using a generator requires a trivial encoder to be passed to json for object
# serialization.
class Encoder(json.JSONEncoder):
def default(self, o): return list(o)
def gen_json_msgs(mb):
while 1:
msg = mb.next()
if msg is None:
break
yield jsonifyMessage(msg)
def jsonifyMessage(msg):
json_msg = {'parts': []}
for (k, v) in msg.items():
json_msg[k] = v.decode('utf-8', 'ignore')
for k in ['To', 'Cc', 'Bcc']:
if not json_msg.get(k):
continue
json_msg[k] = json_msg[k].replace('\n', '').replace('\t', '').replace('\r', '')\
.replace(' ', '').decode('utf-8', 'ignore').split(',')
for part in msg.walk():
json_part = {}
if part.get_content_maintype() == 'multipart':
continue
json_part['contentType'] = part.get_content_type()
content = part.get_payload(decode=False).decode('utf-8', 'ignore')
json_part['content'] = cleanContent(content)
json_msg['parts'].append(json_part)
then = parse(json_msg['Date'])
millis = int(time.mktime(then.timetuple())*1000 + then.microsecond/1000)
json_msg['Date'] = {'$date' : millis}
return json_msg
mbox = mailbox.UnixMailbox(open(MBOX, 'rb'), email.message_from_file)
f = open(OUT_FILE, 'w')
for msg in gen_json_msgs(mbox):
if msg != None:
f.write(json.dumps(msg, cls=Encoder) + '\n')
f.close()
Try this.
import mailbox
import csv
writer = csv.writer(open(("clean_mail.csv", "wb"))
for message in mailbox.mbox('archive.mbox'):
if message.is_multipart():
content = ''.join(part.get_payload() for part in message.get_payload())
else:
content = message.get_payload()
writer.writerow([message['subject'], message['from'], message['date'],content])
or this:
import mailbox
import csv
def get_message(message):
if not message.is_multipart():
return message.get_payload()
contents = ""
for msg in message.get_payload():
contents = contents + str(msg.get_payload()) + '\n'
return contents
if __name__ == "__main__":
writer = csv.writer(open("clean_mail.csv", "wb"))
for message in mailbox.mbox("archive.mbox"):
contents = get_message(message)
writer.writerow([message["subject"], message["from"], message["date"],contents])
Find the documentation here.
A little improvement of Rahul snippet for multipart content:
import sys
import mailbox
import csv
from email.header import decode_header
infile = sys.argv[1]
outfile = sys.argv[2]
writer = csv.writer(open(outfile, "w"))
def get_content(part):
content = ''
payload = part.get_payload()
if isinstance(payload, str):
content += payload
else:
for part in payload:
content += get_content(part)
return content
writer.writerow(['date', 'from', 'to', 'subject', 'content'])
for index, message in enumerate(mailbox.mbox(infile)):
content = get_content(message)
row = [
message['date'],
message['from'].strip('>').split('<')[-1],
message['to'],
decode_header(message['subject'])[0][0],
content
]
writer.writerow(row)
Related
The issue
I'm trying out great expectations with dagster, as per this guide
My pipeline seems to execute correctly until it reaches this block:
expectation = dagster_ge.ge_validation_op_factory(
name='ge_validation_op',
datasource_name='dev.data-pipeline-data-storage.data_pipelines.raw_data.sirene_update',
suite_name='suite.data_pipelines.raw_data.sirene_update',
)
if expectation["success"]:
print("Success")
trying to call expectation["success"] results in a
# TypeError: 'SolidDefinition' object is not subscriptable
When I go inside the code of ge_validation_op_factory, there is a _ge_validation_fn that should yield ExpectationResult, but somehow it gets coverted into a SolidDefinition...
Dagster version = 0.15.9;
Great Expectations version = 0.15.44
Code to reproduce the error
In my code, I am trying to interact with an s3 bucket, so it would be a bit tedious to re-create the code for my example but here it is anyway:
In a gx_postprocessing.py
import json
import boto3
import dagster_ge
from dagster import (
op,
graph,
Field,
String,
OpExecutionContext,
)
from typing import List, Dict
#op(
config_schema={
"bucket": Field(
String,
description="s3 bucket name",
),
"path_in_s3": Field(
String,
description="Prefix representing the path to data",
),
"technical_date": Field(
String,
description="date string to fetch data",
),
"file_name": Field(
String,
description="file name that contains the data",
),
}
)
def read_in_json_datafile_from_s3(context: OpExecutionContext):
bucket = context.op_config["bucket"]
path_in_s3 = context.op_config["path_in_s3"]
technical_date = context.op_config["technical_date"]
file_name = context.op_config["file_name"]
object = f"{path_in_s3}/" f"technical_date={technical_date}/" f"{file_name}"
s3 = boto3.resource("s3")
content_object = s3.Object(bucket, object)
file_content = content_object.get()["Body"].read().decode("utf-8")
json_content = json.loads(file_content)
return json_content
#op
def process_example_dq(data: List[Dict]):
return len(data)
#op
def postprocess_example_dq(numrows, expectation):
if expectation["success"]:
return numrows
else:
raise ValueError
#op
def validate_example_dq(context: OpExecutionContext):
expectation = dagster_ge.ge_validation_op_factory(
name='ge_validation_op',
datasource_name='my_bucket.data_pipelines.raw_data.example_update',
suite_name='suite.data_pipelines.raw_data.example_update',
)
return expectation
#graph(
config={
"read_in_json_datafile_from_s3": {
"config": {
"bucket": "my_bucket",
"path_in_s3": "my_path",
"technical_date": "2023-01-24",
"file_name": "myfile_20230124.json",
}
},
},
)
def example_update_evaluation():
output_dict = read_in_json_datafile_from_s3()
nb_items = process_example_dq(data=output_dict)
expectation = validate_example_dq()
postprocess_example_dq(
numrows=nb_items,
expectation=expectation,
)
Do not forget to add great_expectations_poc_pipeline to your __init__.py where the pipelines=[..] are listed.
In this example, dagster_ge.ge_validation_op_factory(...) is returning an OpDefinition, which is the same type of thing as (for example) process_example_dq, and should be composed in the graph definition the same way, rather than invoked within another op.
So instead, you'd want to have something like:
validate_example_dq = dagster_ge.ge_validation_op_factory(
name='ge_validation_op',
datasource_name='my_bucket.data_pipelines.raw_data.example_update',
suite_name='suite.data_pipelines.raw_data.example_update',
)
Then use that op inside your graph definition the same way you currently are (i.e. expectation = validate_example_dq())
I have deployed my Docker image of a Flask app to AWS beanstalk but everytime the environment health becomes severe, with error:
ELB health is failing or not available for all instances.
I have tried to host a sample program of index.html and it worked fine so maybe something is wrong with the code. Please see the below code to help.
FlaskApp[app.py]
from flask import Flask, jsonify, request
from util import prediction
application = Flask(__name__)
#application.route('/predict', methods=['POST'])
def predict():
data = request.get_json()
try:
sample = data['text']
except KeyError:
return jsonify({'error':'No text sent'})
pred = prediction(sample)
try:
result = jsonify(pred)
except TypeError as e:
result = jsonify({'error': str(e)})
return result
if __name__ == '__main__':
application.run(host='0.0.0.0', debug= True)
Util.py
import nltk
import pandas as pd
from nltk import TweetTokenizer
import numpy as np
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import csv
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from nltk.tokenize import TweetTokenizer
from nltk.tag import pos_tag
import re
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import joblib
import warnings
warnings.filterwarnings("ignore")
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
token = TweetTokenizer()
def lemmatize_sentence(tokens):
lemmatizer = WordNetLemmatizer()
lemmatize_sentence = []
for word, tag in pos_tag(tokens):
if tag.startswith('NN'):
pos = 'n'
elif tag.startswith('VB'):
pos = 'v'
else:
pos = 'a'
lemmatize_sentence.append(lemmatizer.lemmatize(word, pos))
return lemmatize_sentence
# print(' '.join(lemmatize_sentence(data[0][0])))
# Data cleaning, getting rid of words not needed for analysis.
stop_words = stopwords.words('english')
def cleaned(token):
if token == 'u':
return 'you'
if token == 'r':
return 'are'
if token == 'some1':
return 'someone'
if token == 'yrs':
return 'years'
if token == 'hrs':
return 'hours'
if token == 'mins':
return 'minutes'
if token == 'secs':
return 'seconds'
if token == 'pls' or token == 'plz':
return 'please'
if token == '2morow':
return 'tomorrow'
if token == '2day':
return 'today'
if token == '4got' or token == '4gotten':
return 'forget'
if token == 'amp' or token == 'quot' or token == 'lt' or token == 'gt':
return ''
return token
# Noise removal from data, removing links, mentions and words with less than 3 length.
def remove_noise(tokens):
cleaned_tokens = []
for token, tag in pos_tag(tokens):
# using non capturing groups ?:)// and eleminating the token if its a link.
token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+#]|[!*\(\),]|(?:%[0-9a-fA-F]))+', '', token)
token = re.sub('[^a-zA-Z]', ' ', token)
# eliminating token if its a mention
token = re.sub("(#[A-Za-z0-9_]+)", "", token)
if tag.startswith("NN"):
pos = 'n'
elif tag.startswith("VB"):
pos = 'v'
else:
pos = 'a'
lemmatizer = WordNetLemmatizer()
token = lemmatizer.lemmatize(token, pos)
cleaned_token = cleaned(token.lower())
# Eliminating if the length of the token is less than 3, if its a punctuation or if it is a stopword.
if cleaned_token not in string.punctuation and len(cleaned_token) > 2 and cleaned_token not in stop_words:
cleaned_tokens.append(cleaned_token)
return cleaned_tokens
with open ('Models/Sentimenttfpipe', 'rb') as f:
loaded_pipeline = joblib.load(f)
def prediction(body):
# loaded_pipeline = joblib.load('Api/Models/Sentimenttfpipe')
text= []
test = token.tokenize(body)
test = remove_noise(test)
text.append(" ".join(test))
test = pd.DataFrame(text, columns=['text'])
a = loaded_pipeline.predict(test['text'].values.astype('U'))
final = []
if a[0] == 0:
final.append({'Label' : 'Relaxed'})
return {'Label' : 'Relaxed'}
if a[0] == 1:
final.append({'Label' : 'Angry'})
return {'Label' : 'Angry'}
if a[0] == 2:
final.append({'Label' : 'Fearful'})
return {'Label' : 'Fearful'}
if a[0] == 3:
final.append({'Label' : 'Happy'})
return {'Label' : 'Happy'}
if a[0] == 4:
final.append({'Label' : 'Sad'})
return {'Label' : 'Sad'}
if a[0] == 5:
final.append({'Label' : 'Surprised'})
return {'Label' : 'Surprised'}
if __name__ == '__main__':
sen = "May the force be with you"
a = prediction(sen)
print(a)
Dockerfile
FROM python:3.10.8
WORKDIR /app
COPY ["requirements.txt", "./"]
RUN pip install -r requirements.txt
RUN python -c "import nltk; nltk.download('averaged_perceptron_tagger'); nltk.download('wordnet'); nltk.download('omw-1.4'); nltk.download('stopwords');"
COPY . .
EXPOSE 5000
ENTRYPOINT [ "gunicorn", "--bind=0.0.0.0:5000", "app:application" ]
docker-compose.yml
version: "3.7"
services:
mlapp:
container_name: Container
image: mlapp
ports:
- "5000:5000"
build:
context: .
dockerfile: Dockerfile
Requirement.txt
Flask>=2.2.2
joblib==1.2.0
nltk==3.7
numpy==1.21.6
pandas==1.5.1
regex==2022.10.31
requests==2.28.1
scikit-learn==1.1.3
gunicorn==20.1.0
I got it to work and all i had to do was delete the docker-compose.yml file.
I don't know how, but it works.
I'm trying to render the Django template after uploading a file (POST method). The issue is: I insert some print() inside the POST request.method and everything is working fine (terminal VSCode) but the template (HTML) is not being displayed inside the render() function.
View.py:
def index(request):
if 'GET' == request.method:
print("It didn't work it!")
print(request.FILES['file'])
return render(request, 'auditoria_app/index.html')
else:
print('It worked it!')
excel_file = request.FILES["file"]
wb = openpyxl.load_workbook(excel_file, data_only=True)
wb.security.workbook_password = 'Rnip*2020'
inputsCombo = wb['Inputs COMBO']
inputsDNCombo = wb['Inputs DN COMBO']
outputCombo = wb['OutPut COMBO']
faixas = wb['Faixas']
wb2 = openpyxl.load_workbook('media/sigma.xlsx', data_only=True)
sigma = wb2['sigma']
sic = wb2['Performance']
# wb4 = openpyxl.load_workbook('media/ultimoContrato.xlsm', data_only=True)
# ultimoContrato = wb4['Base']
numeroIbms = inputsCombo['F5'].value
ibms = []
output = outputResults(numeroIbms, outputCombo)
for i in range(8, 8 + numeroIbms):
ibm = Ibm(i, inputsCombo, inputsDNCombo, outputCombo, faixas, sigma)
ibms.append(ibm)
print(ibm.numeroIbm)
# sigmaReport(sigma, ibm)
return render(request, 'auditoria_app/index.html',
{'ibms': ibms,
'numeroIbms': numeroIbms,
'output': output,
})
Message displayed on VSCode terminal:
It worked it!
Outside: SAE_R360_pagnussat.xlsm
1038108
1049885
[04/Sep/2021 21:18:36] "POST /auditoria/ HTTP/1.1" 200 5132
I am using Cloudwatch subscriptions to send over cloudtrail log of one account into another. The Account receiving the logs has a Kinesis data stream which receives the logs from the cloudwatch subscription and invokes the standard lambda function provided by AWS to parse and store the logs to an S3 bucket of the log receiver account.
The log files getting written to s3 bucket are in the form of :
{"eventVersion":"1.08","userIdentity":{"type":"AssumedRole","principalId":"AA:i-096379450e69ed082","arn":"arn:aws:sts::34502sdsdsd:assumed-role/RDSAccessRole/i-096379450e69ed082","accountId":"34502sdsdsd","accessKeyId":"ASIAVAVKXAXXXXXXXC","sessionContext":{"sessionIssuer":{"type":"Role","principalId":"AROAVAVKXAKDDDDD","arn":"arn:aws:iam::3450291sdsdsd:role/RDSAccessRole","accountId":"345029asasas","userName":"RDSAccessRole"},"webIdFederationData":{},"attributes":{"mfaAuthenticated":"false","creationDate":"2021-04-27T04:38:52Z"},"ec2RoleDelivery":"2.0"}},"eventTime":"2021-04-27T07:24:20Z","eventSource":"ssm.amazonaws.com","eventName":"ListInstanceAssociations","awsRegion":"us-east-1","sourceIPAddress":"188.208.227.188","userAgent":"aws-sdk-go/1.25.41 (go1.13.15; linux; amd64) amazon-ssm-agent/","requestParameters":{"instanceId":"i-096379450e69ed082","maxResults":20},"responseElements":null,"requestID":"a5c63b9d-aaed-4a3c-9b7d-a4f7c6b774ab","eventID":"70de51df-c6df-4a57-8c1e-0ffdeb5ac29d","readOnly":true,"resources":[{"accountId":"34502914asasas","ARN":"arn:aws:ec2:us-east-1:3450291asasas:instance/i-096379450e69ed082"}],"eventType":"AwsApiCall","managementEvent":true,"eventCategory":"Management","recipientAccountId":"345029149342"}
{"eventVersion":"1.08","userIdentity":{"type":"AssumedRole","principalId":"AROAVAVKXAKPKZ25XXXX:AmazonMWAA-airflow","arn":"arn:aws:sts::3450291asasas:assumed-role/dev-1xdcfd/AmazonMWAA-airflow","accountId":"34502asasas","accessKeyId":"ASIAVAVKXAXXXXXXX","sessionContext":{"sessionIssuer":{"type":"Role","principalId":"AROAVAVKXAKPKZXXXXX","arn":"arn:aws:iam::345029asasas:role/service-role/AmazonMWAA-dlp-dev-1xdcfd","accountId":"3450291asasas","userName":"dlp-dev-1xdcfd"},"webIdFederationData":{},"attributes":{"mfaAuthenticated":"false","creationDate":"2021-04-27T07:04:08Z"}},"invokedBy":"airflow.amazonaws.com"},"eventTime":"2021-04-27T07:23:46Z","eventSource":"logs.amazonaws.com","eventName":"CreateLogStream","awsRegion":"us-east-1","sourceIPAddress":"airflow.amazonaws.com","userAgent":"airflow.amazonaws.com","errorCode":"ResourceAlreadyExistsException","errorMessage":"The specified log stream already exists","requestParameters":{"logStreamName":"scheduler.py.log","logGroupName":"dlp-dev-DAGProcessing"},"responseElements":null,"requestID":"40b48ef9-fc4b-4d1a-8fd1-4f2584aff1e9","eventID":"ef608d43-4765-4a3a-9c92-14ef35104697","readOnly":false,"eventType":"AwsApiCall","apiVersion":"20140328","managementEvent":true,"eventCategory":"Management","recipientAccountId":"3450291asasas"}
The problem with this type of log lines is that Athena is not able to Parse these log lines and I am not able to query the logs using Athena.
I tried modifying the blueprint lambda function to save the log file as a standard JSON result which would make it easy for Athena to parse the files.
Eg:
{'Records': ['{"eventVersion":"1.08","userIdentity":{"type":"AssumedRole","principalId":"AROAVAVKXAKPBRW2S3TAF:i-096379450e69ed082","arn":"arn:aws:sts::345029149342:assumed-role/RightslineRDSAccessRole/i-096379450e69ed082","accountId":"345029149342","accessKeyId":"ASIAVAVKXAKPBL653UOC","sessionContext":{"sessionIssuer":{"type":"Role","principalId":"AROAVAVKXAKPXXXXXXX","arn":"arn:aws:iam::34502asasas:role/RDSAccessRole","accountId":"345029asasas","userName":"RDSAccessRole"},"webIdFederationData":{},"attributes":{"mfaAuthenticated":"false","creationDate":"2021-04-27T04:38:52Z"},"ec2RoleDelivery":"2.0"}},"eventTime":"2021-04-27T07:24:20Z","eventSource":"ssm.amazonaws.com","eventName":"ListInstanceAssociations","awsRegion":"us-east-1","sourceIPAddress":"188.208.227.188","userAgent":"aws-sdk-go/1.25.41 (go1.13.15; linux; amd64) amazon-ssm-agent/","requestParameters":{"instanceId":"i-096379450e69ed082","maxResults":20},"responseElements":null,"requestID":"a5c63b9d-aaed-4a3c-9b7d-a4f7c6b774ab","eventID":"70de51df-c6df-4a57-8c1e-0ffdeb5ac29d","readOnly":true,"resources":[{"accountId":"3450291asasas","ARN":"arn:aws:ec2:us-east-1:34502asasas:instance/i-096379450e69ed082"}],"eventType":"AwsApiCall","managementEvent":true,"eventCategory":"Management","recipientAccountId":"345029asasas"}]}
The modified code for Blueprint Lambda function that I looks like:
import base64
import json
import gzip
from io import BytesIO
import boto3
def transformLogEvent(log_event):
return log_event['message'] + '\n'
def processRecords(records):
for r in records:
data = base64.b64decode(r['data'])
striodata = BytesIO(data)
with gzip.GzipFile(fileobj=striodata, mode='r') as f:
data = json.loads(f.read())
recId = r['recordId']
if data['messageType'] == 'CONTROL_MESSAGE':
yield {
'result': 'Dropped',
'recordId': recId
}
elif data['messageType'] == 'DATA_MESSAGE':
result = {}
result["Records"] = {}
events = []
for e in data['logEvents']:
events.append(e["message"])
result["Records"] = events
print(result)
if len(result) <= 6000000:
yield {
'data': result,
'result': 'Ok',
'recordId': recId
}
else:
yield {
'result': 'ProcessingFailed',
'recordId': recId
}
else:
yield {
'result': 'ProcessingFailed',
'recordId': recId
}
def putRecordsToFirehoseStream(streamName, records, client, attemptsMade, maxAttempts):
failedRecords = []
codes = []
errMsg = ''
# if put_record_batch throws for whatever reason, response['xx'] will error out, adding a check for a valid
# response will prevent this
response = None
try:
response = client.put_record_batch(DeliveryStreamName=streamName, Records=records)
except Exception as e:
failedRecords = records
errMsg = str(e)
# if there are no failedRecords (put_record_batch succeeded), iterate over the response to gather results
if not failedRecords and response and response['FailedPutCount'] > 0:
for idx, res in enumerate(response['RequestResponses']):
# (if the result does not have a key 'ErrorCode' OR if it does and is empty) => we do not need to re-ingest
if 'ErrorCode' not in res or not res['ErrorCode']:
continue
codes.append(res['ErrorCode'])
failedRecords.append(records[idx])
errMsg = 'Individual error codes: ' + ','.join(codes)
if len(failedRecords) > 0:
if attemptsMade + 1 < maxAttempts:
print('Some records failed while calling PutRecordBatch to Firehose stream, retrying. %s' % (errMsg))
putRecordsToFirehoseStream(streamName, failedRecords, client, attemptsMade + 1, maxAttempts)
else:
raise RuntimeError('Could not put records after %s attempts. %s' % (str(maxAttempts), errMsg))
def putRecordsToKinesisStream(streamName, records, client, attemptsMade, maxAttempts):
failedRecords = []
codes = []
errMsg = ''
# if put_records throws for whatever reason, response['xx'] will error out, adding a check for a valid
# response will prevent this
response = None
try:
response = client.put_records(StreamName=streamName, Records=records)
except Exception as e:
failedRecords = records
errMsg = str(e)
# if there are no failedRecords (put_record_batch succeeded), iterate over the response to gather results
if not failedRecords and response and response['FailedRecordCount'] > 0:
for idx, res in enumerate(response['Records']):
# (if the result does not have a key 'ErrorCode' OR if it does and is empty) => we do not need to re-ingest
if 'ErrorCode' not in res or not res['ErrorCode']:
continue
codes.append(res['ErrorCode'])
failedRecords.append(records[idx])
errMsg = 'Individual error codes: ' + ','.join(codes)
if len(failedRecords) > 0:
if attemptsMade + 1 < maxAttempts:
print('Some records failed while calling PutRecords to Kinesis stream, retrying. %s' % (errMsg))
putRecordsToKinesisStream(streamName, failedRecords, client, attemptsMade + 1, maxAttempts)
else:
raise RuntimeError('Could not put records after %s attempts. %s' % (str(maxAttempts), errMsg))
def createReingestionRecord(isSas, originalRecord):
if isSas:
return {'data': base64.b64decode(originalRecord['data']), 'partitionKey': originalRecord['kinesisRecordMetadata']['partitionKey']}
else:
return {'data': base64.b64decode(originalRecord['data'])}
def getReingestionRecord(isSas, reIngestionRecord):
if isSas:
return {'Data': reIngestionRecord['data'], 'PartitionKey': reIngestionRecord['partitionKey']}
else:
return {'Data': reIngestionRecord['data']}
def lambda_handler(event, context):
print(event)
isSas = 'sourceKinesisStreamArn' in event
streamARN = event['sourceKinesisStreamArn'] if isSas else event['deliveryStreamArn']
region = streamARN.split(':')[3]
streamName = streamARN.split('/')[1]
records = list(processRecords(event['records']))
projectedSize = 0
dataByRecordId = {rec['recordId']: createReingestionRecord(isSas, rec) for rec in event['records']}
putRecordBatches = []
recordsToReingest = []
totalRecordsToBeReingested = 0
for idx, rec in enumerate(records):
if rec['result'] != 'Ok':
continue
projectedSize += len(rec['data']) + len(rec['recordId'])
# 6000000 instead of 6291456 to leave ample headroom for the stuff we didn't account for
if projectedSize > 6000000:
totalRecordsToBeReingested += 1
recordsToReingest.append(
getReingestionRecord(isSas, dataByRecordId[rec['recordId']])
)
records[idx]['result'] = 'Dropped'
del(records[idx]['data'])
# split out the record batches into multiple groups, 500 records at max per group
if len(recordsToReingest) == 500:
putRecordBatches.append(recordsToReingest)
recordsToReingest = []
if len(recordsToReingest) > 0:
# add the last batch
putRecordBatches.append(recordsToReingest)
# iterate and call putRecordBatch for each group
recordsReingestedSoFar = 0
if len(putRecordBatches) > 0:
client = boto3.client('kinesis', region_name=region) if isSas else boto3.client('firehose', region_name=region)
for recordBatch in putRecordBatches:
if isSas:
putRecordsToKinesisStream(streamName, recordBatch, client, attemptsMade=0, maxAttempts=20)
else:
putRecordsToFirehoseStream(streamName, recordBatch, client, attemptsMade=0, maxAttempts=20)
recordsReingestedSoFar += len(recordBatch)
print('Reingested %d/%d records out of %d' % (recordsReingestedSoFar, totalRecordsToBeReingested, len(event['records'])))
else:
print('No records to be reingested')
return {"records": records}
My end goal is to store the result on S3 as JSON so that it can be queried easily with Athena.
the line where the transformation is happening is:
elif data['messageType'] == 'DATA_MESSAGE':
Any help in this would be greatly appreciated.
I have a tasks that contains dynamic arguments I want to run periodically, how do I pass dynamic elements to the tasks arguments when the task is being called in django celery beat?
Here is the task I want to run periodically:
#task(bind=True)
def generate_export(export_type, xform, export_id=None, options=None):
"""
Create appropriate export object given the export type.
param: export_type
param: xform
params: export_id: ID of export object associated with the request
param: options: additional parameters required for the lookup.
binary_select_multiples: boolean flag
end: end offset
ext: export extension type
dataview_pk: dataview pk
group_delimiter: "/" or "."
query: filter_query for custom queries
remove_group_name: boolean flag
split_select_multiples: boolean flag
index_tag: ('[', ']') or ('_', '_')
show_choice_labels: boolean flag
language: language labels as in the XLSForm/XForm
"""
username = xform.user.username
id_string = xform.id_string
end = options.get("end")
extension = options.get("extension", export_type)
filter_query = options.get("query")
remove_group_name = options.get("remove_group_name", False)
start = options.get("start")
export_type_func_map = {
Export.XLS_EXPORT: 'to_xls_export',
Export.CSV_EXPORT: 'to_flat_csv_export',
Export.DHIS2CSV_EXPORT: 'to_dhis2csv_export',
Export.CSV_ZIP_EXPORT: 'to_zipped_csv',
Export.SAV_ZIP_EXPORT: 'to_zipped_sav',
Export.GOOGLE_SHEETS_EXPORT: 'to_google_sheets',
}
if xform is None:
xform = XForm.objects.get(
user__username__iexact=username, id_string__iexact=id_string)
dataview = None
if options.get("dataview_pk"):
dataview = DataView.objects.get(pk=options.get("dataview_pk"))
records = dataview.query_data(dataview, all_data=True,
filter_query=filter_query)
total_records = dataview.query_data(dataview,
count=True)[0].get('count')
else:
records = query_data(xform, query=filter_query, start=start, end=end)
if filter_query:
total_records = query_data(xform, query=filter_query, start=start,
end=end, count=True)[0].get('count')
else:
total_records = xform.num_of_submissions
if isinstance(records, QuerySet):
records = records.iterator()
export_builder = ExportBuilder()
export_builder.TRUNCATE_GROUP_TITLE = True \
if export_type == Export.SAV_ZIP_EXPORT else remove_group_name
export_builder.GROUP_DELIMITER = options.get(
"group_delimiter", DEFAULT_GROUP_DELIMITER
)
export_builder.SPLIT_SELECT_MULTIPLES = options.get(
"split_select_multiples", True
)
export_builder.BINARY_SELECT_MULTIPLES = options.get(
"binary_select_multiples", False
)
export_builder.INCLUDE_LABELS = options.get('include_labels', False)
export_builder.INCLUDE_LABELS_ONLY = options.get(
'include_labels_only', False
)
export_builder.INCLUDE_HXL = options.get('include_hxl', False)
export_builder.INCLUDE_IMAGES \
= options.get("include_images", settings.EXPORT_WITH_IMAGE_DEFAULT)
export_builder.VALUE_SELECT_MULTIPLES = options.get(
'value_select_multiples', False)
export_builder.REPEAT_INDEX_TAGS = options.get(
"repeat_index_tags", DEFAULT_INDEX_TAGS
)
export_builder.SHOW_CHOICE_LABELS = options.get('show_choice_labels',
False)
export_builder.language = options.get('language')
# 'win_excel_utf8' is only relevant for CSV exports
if 'win_excel_utf8' in options and export_type != Export.CSV_EXPORT:
del options['win_excel_utf8']
export_builder.set_survey(xform.survey, xform)
# change the dhis2csv exports to standard csv format
if extension == 'dhis2csv':
extension = 'csv'
temp_file = NamedTemporaryFile(suffix=("." + extension))
columns_with_hxl = export_builder.INCLUDE_HXL and get_columns_with_hxl(
xform.survey_elements)
# get the export function by export type
func = getattr(export_builder, export_type_func_map[export_type])
try:
func.__call__(
temp_file.name, records, username, id_string, filter_query,
start=start, end=end, dataview=dataview, xform=xform,
options=options, columns_with_hxl=columns_with_hxl,
total_records=total_records
)
except NoRecordsFoundError:
pass
except SPSSIOError as e:
export = get_or_create_export(export_id, xform, export_type, options)
export.error_message = str(e)
export.internal_status = Export.FAILED
export.save()
report_exception("SAV Export Failure", e, sys.exc_info())
return export
# generate filename
basename = "%s_%s" % (
id_string, datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f"))
if remove_group_name:
# add 'remove group name' flag to filename
basename = "{}-{}".format(basename, GROUPNAME_REMOVED_FLAG)
if dataview:
basename = "{}-{}".format(basename, DATAVIEW_EXPORT)
filename = basename + "." + extension
# check filename is unique
while not Export.is_filename_unique(xform, filename):
filename = increment_index_in_filename(filename)
file_path = os.path.join(
username,
'exports',
id_string,
export_type,
filename)
# seek to the beginning as required by storage classes
temp_file.seek(0)
export_filename = default_storage.save(file_path,
File(temp_file, file_path))
temp_file.close()
dir_name, basename = os.path.split(export_filename)
# get or create export object
export = get_or_create_export(export_id, xform, export_type, options)
export.filedir = dir_name
export.filename = basename
export.internal_status = Export.SUCCESSFUL
# do not persist exports that have a filter
# Get URL of the exported sheet.
if export_type == Export.GOOGLE_SHEETS_EXPORT:
export.export_url = export_builder.url
# if we should create a new export is true, we should not save it
if start is None and end is None:
export.save()
return export
and this is where I call the tasks in the celery beat schedule:
CELERY_BEAT_SCHEDULE = {
'download_csv': {
'task': 'onadata.libs.utils.export_tools.generate_export',
# There are 4 ways we can handle time, read further
'schedule': crontab(minute='*'),
# If you're using any arguments
'args': ()
}
}
how do I pass parameters into the arguments for the tasks??
There is no way to pass argument dynamically in Celery Beat. I think your function is not suitable with periodic task.
Instead of giving a factor directly to the generate_export function, it must be changed to get the required items within the function. Or change to a simple asynchronous operation.
I faced a similar problem. The args field in beat_schedule is fixed at startup and does not change afterward.
But there is a hackish way to pass different arguments to your task.
Use the before_task_publish signal to add custom data in headers.
from celery.signals import before_task_publish
#before_task_publish.connect
def before_publish(sender=None, headers=None, body=None, **kwargs):
if sender == "tasks.generate_export":
headers["custom_args"] = {
"export_type": "some_val"
"xform": "some_val"
"export_id": get_export_id()
"options": options_dict
}
By default, Celery uses JSON serializer. So, make sure the data you add to headers are JSON serializable. Alternatively, you can use pickle to serialize the data, but it brings security concerns with it.
Now you can access these headers in a bound task.
#task(bind=True)
def generate_export(self):
args = self.request.get("custom_args", None)
# do something with args