I am importing data into QuestDB over its HTTP REST API from Python.
The table (test_data) has the following properties:
'name': 'time', 'size': 8, 'type': 'TIMESTAMP'
'name': 'open', 'size': 8, 'type': 'DOUBLE'
'name': 'high', 'size': 8, 'type': 'DOUBLE'
'name': 'low', 'size': 8, 'type': 'DOUBLE'
'name': 'close', 'size': 8, 'type': 'DOUBLE'
'name': 'volume', 'size': 8, 'type': 'DOUBLE'
'name': 'ts', 'size': 8, 'type': 'TIMESTAMP'
Note: the 'time' column is the designated timestamp
The imported data is sourced from a pandas DataFrame. The DataFrame has the same headers, the 'time' column is its index, and the 'ts' column is the timestamp of when the data was acquired. The import function is shown below.
import csv
import io
import json
import pprint

import requests

def to_csv_str(table):
    output = io.StringIO()
    csv.writer(output, dialect="excel").writerows(table)
    return output.getvalue().encode("utf-8")

def write_to_table(df, table="test_data"):
    table_name = table
    table = [[df.index.name] + df.columns.tolist()] + df.reset_index().values.tolist()
    table_csv = to_csv_str(table)
    schema = json.dumps([])
    response = requests.post(
        "http://localhost:9000/imp",
        params={"fmt": "json"},
        files={"schema": schema, "data": (table_name, table_csv)},
    ).json()
    pprint.pprint(response)
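For reference, a minimal call sketch (the sample values are invented; only the column layout matches the test_data table described above):

import pandas as pd

# hypothetical single-row frame with the same layout as test_data
df = pd.DataFrame(
    {
        "open": [1.0], "high": [2.0], "low": [0.5], "close": [1.5],
        "volume": [100.0],
        "ts": [pd.Timestamp("2022-03-09 10:15:00")],
    },
    index=pd.DatetimeIndex([pd.Timestamp("2022-03-09 10:00:00")], name="time"),
)

write_to_table(df, table="test_data")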
The import executes successfully the first time. If I rerun the import for the same data (all values are the same except the 'ts' column, which records when the data was acquired), an additional row is appended that matches the existing row in every column except 'ts'. How can I define the 'time' column so that it is forced to be unique, and any imported row with a duplicate 'time' value is omitted?
Example screenshots for a 6-row import (images omitted):
Initial import with all rows successful
Reissued import with only 5 errors (expected 6)
Table data from the web console
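For context, a client-side filtering sketch (not a QuestDB uniqueness constraint; newest_timestamp is a hypothetical helper, and it assumes the stored 'time' values and the DataFrame index are both UTC): query the latest stored timestamp through the REST /exec endpoint and only import rows that are newer.

def newest_timestamp(table="test_data"):
    # hypothetical helper: read max(time) via the REST /exec endpoint
    resp = requests.get(
        "http://localhost:9000/exec",
        params={"query": f"SELECT max(time) FROM {table}"},
    ).json()
    value = resp["dataset"][0][0]
    # the timestamp comes back as an ISO-8601 UTC string such as '2022-03-09T10:00:00.000000Z'
    return pd.Timestamp(value).tz_convert(None) if value is not None else None

latest = newest_timestamp()
fresh = df if latest is None else df[df.index > latest]
write_to_table(fresh, table="test_data")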
Related
I am new to Dataflow and got stuck with the below issue.
Problem statement: I need a Dataflow job (Python) to load XML from GCS into BigQuery as a batch load. The destination table in BigQuery is dynamic, calculated at run time from the XML file name.
Solution decided: I followed this article - https://medium.com/google-cloud/how-to-load-xml-data-into-bigquery-using-python-dataflow-fd1580e4af48 - where a static table was used in the WriteToBigQuery transform, but I am using a dynamic table name obtained via a callable function (code attached below).
Job graph: (screenshot omitted)
Code:
import apache_beam as beam
import argparse
from apache_beam.options.pipeline_options import PipelineOptions
from google.cloud import storage

# List the gcs file objects
# ToDo: Use Apache Beam GCSIO Match Patterns To list the file objects
storage_client = storage.Client()
bucket_name = "xmltobq"
bucket = storage_client.get_bucket(bucket_name)
blobs = list(bucket.list_blobs(prefix="xmlfiles/"))
blob_files = [blob.name for blob in blobs if ".xml" in blob.name]

# Static schema
table_schema = {
    "fields": [
        {'name': 'filename', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'CustomerID', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'EmployeeID', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'OrderDate', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'RequiredDate', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'ShipInfo', 'type': 'RECORD', 'mode': 'NULLABLE', 'fields': [
            {'name': 'ShipVia', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'Freight', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'ShipName', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'ShipAddress', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'ShipCity', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'ShipRegion', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'ShipPostalCode', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'ShipCountry', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'ShippedDate', 'type': 'STRING', 'mode': 'NULLABLE'},
        ]},
    ]
}

def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:

        def readfiles(element):
            '''
            Input PCollection: GCS element path
            Output PCollection: (XML, filename)
            '''
            # Simple XML conversion using the xmltodict package
            # ToDo: Once specific XML paths are acquired, we can parse only the required fields
            import xmltodict
            import apache_beam as beam
            gcs_file = beam.io.filesystems.FileSystems.open("gs://xmltobq/" + element)
            parsed_xml = xmltodict.parse(gcs_file)
            return parsed_xml, element.split("/")[1].split(".")[0]

        def xmlformatting(element):
            '''
            Input PCollection: XML
            Output PCollection: a generator of modified XML elements
            '''
            data, filename = element
            for order in data['Root']['Orders']['Order']:
                yield formatting(order, filename)

        # def tablename(e):
        #     import re
        #     return "gcp-bq-2021:dataset4." + re.sub("[\s+,(,)]", "", e)

        def formatting(order, filename):
            '''
            Input PCollection: (XML element, filename)
            Output PCollection: modified XML element
            ToDo: this only handles the sample XML; production code will have a different
            formatting process
            '''
            import copy
            import re
            order_copy = copy.deepcopy(order)
            if "#ShippedDate" in order['ShipInfo']:
                order_copy['ShipInfo']['ShippedDate'] = order['ShipInfo']['#ShippedDate']
                del order_copy['ShipInfo']['#ShippedDate']
            order_copy['filename'] = "gcp-bq-2021:testdataset." + re.sub("[\s+,(,)]", "", filename)
            return order_copy

        # Dynamic table name (same as the input XML file) by adding the file name as a key in the
        # dictionary and accessing it in WriteToBigQuery
        # ToDo: In production code a dynamic schema option will be included in the WriteToBigQuery transform
        pipeline_data = (
            p
            | "Create GCS Object List" >> beam.Create(blob_files)
            | "XMLConversion" >> beam.Map(readfiles)
            | "XMLformatting" >> beam.FlatMap(xmlformatting)
            | "shuffle" >> beam.Reshuffle()
            | beam.io.WriteToBigQuery(
                table=lambda row: row['filename'],  # lambda returning the dynamic table name
                schema=table_schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,  # or WRITE_TRUNCATE
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                custom_gcs_temp_location="gs://xmltobq",
            )
        )
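For completeness, the usual Beam entry point (not shown in the snippet above) would be along these lines:

if __name__ == "__main__":
    run()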
Sample XML file: (not included here)
ISSUE:
I am able to run the job successfully on the Dataflow runner and move the files into BigQuery. But the time spent in WriteToBigQuery is too long, especially in the ParDo(TriggerCopyJobs) step, where the throughput is almost below 1 element/second.
With a single static table instead of a dynamic one, the job completes lightning fast.
Is there anything I am doing wrong that is preventing parallel processing?
Machine type used: n1-highcpu-8.
JobID: 2022-03-09_07_28_47-10567887862012507747
I have the following query:
result = data.values('collaborator').annotate(amount=Count('cc'))
top = result.order_by('-amount')[:3]
This gets the collaborator field from data (data is a Django QuerySet); I am trying to build something like a GROUP BY query, and it works, but when I call the .values() method on the top variable it returns all the model instances as dicts in a QuerySet. I need the annotate result as a list of dicts.
The following is the content of the top variable in the shell:
<QuerySet [{'collaborator': '1092788966', 'amount': 20}, {'collaborator': '1083692812', 'amount': 20}, {'collaborator': '1083572767', 'amount': 20}]>
But when I call list(top.values()) I get the following result:
[{'name': 'Alyse Caffin', 'cc': '1043346592', 'location': 'Wu’an', 'gender': 'MASCULINO', 'voting_place': 'Corporación Educativa American School Barranquilla', 'table_number': '6', 'status': 'ESPERADO', 'amount': 1}, {'name': 'Barthel Hanlin', 'cc': '1043238706', 'location': 'General Santos', 'gender': 'MASCULINO', 'voting_place': 'Colegio San José – Compañía de Jesús Barranquilla', 'table_number': '10', 'status': 'PENDIENTE', 'amount': 1}, {'name': 'Harv Gertz', 'cc': '1043550513', 'location': 'Makueni', 'gender': 'FEMENINO', 'voting_place': 'Corporación Educativa American School Barranquilla', 'table_number': '7', 'status': 'ESPERADO', 'amount': 1}]
I just want the result to be like:
[{'collaborator': '1092788966', 'amount': 20}, {'collaborator': '1083692812', 'amount': 20}, {'collaborator': '1083572767', 'amount': 20}]
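For reference, since data.values('collaborator').annotate(...) already yields dicts, materializing the sliced queryset directly (a sketch, assuming data is a QuerySet of that model) gives exactly such a list:

top_as_list = list(top)
# [{'collaborator': '1092788966', 'amount': 20}, ...]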
There is something wrong, maybe a typo (it also seems you do not show the full query... something like data = yourmodel.objects.filter(...) is missing before):
The output of list(top.values()) returns a completely different model's fields than what you post as the top QuerySet - are you sure you really did:
result = data.values('collaborator').annotate(amount=Count('cc'))
top = result.order_by('-amount')[:3]
list(top.values())
because it should deliver what you expect (provided that data is a QuerySet).
I am trying to use the Glue create_table API to create the Data Catalog table and thus bypass the need for a crawler, because the schema is going to be the same every time.
I am able to create the Data Catalog table, and now whenever an updated CSV file arrives in S3 the table is updated (when I run an Athena query it shows the updated data).
But my problem is that there are cases when I will only get deltas (that is, only the data that changed, not the complete data). When only deltas arrive and I run an Athena query on the table manually, it only shows the deltas; the data is not merged with the earlier complete data.
So I do not understand how to update only the deltas and merge them into the original data.
Is it even possible with AWS Glue?
Below is my current code:
import boto3
import json

client = boto3.client('glue')

def lambda_handler(event, context):
    response = client.create_table(
        DatabaseName='sample',
        TableInput={
            'Name': 'lawyertable4',
            'Description': 'Table created with boto3 API',
            'StorageDescriptor': {
                'Columns': [
                    {
                        'Name': 'id',
                        'Type': 'bigint',
                    },
                    {
                        'Name': 'username',
                        'Type': 'string',
                    },
                    {
                        'Name': 'time_stamp',
                        'Type': 'string',
                    },
                ],
                'Location': 's3://location/sample/',
                'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
                'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
                'Compressed': False,
                'SerdeInfo': {
                    'SerializationLibrary': 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
                    'Parameters': {
                        'field.delim': ',',
                        'skip.header.line.count': '1'
                    }
                }
            },
        }
    )
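As a side note, client.create_table raises an error if the table already exists, so if this Lambda can run more than once, a guard along these lines may be needed (a sketch; create_table_if_missing is a hypothetical helper, and the exception is boto3's Glue AlreadyExistsException):

def create_table_if_missing(table_input, database='sample'):
    # hypothetical helper: ignore the error when the table is already in the catalog
    try:
        return client.create_table(DatabaseName=database, TableInput=table_input)
    except client.exceptions.AlreadyExistsException:
        return None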
So I have a nested dictionary in Python as follows:
{'name': 'Waffles',
'subCategories': [{'menu': [{'name': 'Fig & Honey with Fresh Cream','price': 120},
{'name': 'Toffeed Banana', 'price': 110}],
'name': 'Sweet',
'description': 'Sweet and yummy'},
{'menu': [{'name': 'Mushroom Cheese Gratin','price': 175},
{'name': 'Pepper Chicken Waffle', 'price': 180}],
'name': 'Savoury'
'description' : 'Salty and yummy'}]
}
What I am looking to do is separate the dict into 2 dicts as follows:
{'name': 'Waffles(Sweet)',
'menu': [{'name': 'Fig & Honey with Fresh Cream','price': 120},
{'name': 'Toffeed Banana', 'price': 110}],
'description' : 'Sweet and yummy'}
{'name': 'Waffles(Savoury)',
'menu': [{'name': 'Mushroom Cheese Gratin','price': 175},
{'name': 'Pepper Chicken Waffle', 'price': 180}],
'description': 'Salty and yummy'}
Note that the name key is a combination of the same key in the outer and inner dicts.
What would be the best way to tackle this?
Hoping the code is self-explanatory!
import pprint

d = {'name': 'Waffles',
     'subCategories': [
         {'menu': [{'name': 'Fig & Honey with Fresh Cream', 'price': 120},
                   {'name': 'Toffeed Banana', 'price': 110}],
          'name': 'Sweet',
          'description': 'Sweet and yummy'},
         {'menu': [{'name': 'Mushroom Cheese Gratin', 'price': 175},
                   {'name': 'Pepper Chicken Waffle', 'price': 180}],
          'name': 'Savoury',
          'description': 'Salty and yummy'}]
     }

menu = []
for category in d.get('subCategories', []):
    category['name'] = "{}({})".format(d['name'], category.get('name', ''))
    menu.append(category)

pprint.pprint(menu)
and the sample output
[{'description': 'Sweet and yummy',
'menu': [{'name': 'Fig & Honey with Fresh Cream', 'price': 120},
{'name': 'Toffeed Banana', 'price': 110}],
'name': 'Waffles(Sweet)'},
{'description': 'Salty and yummy',
'menu': [{'name': 'Mushroom Cheese Gratin', 'price': 175},
{'name': 'Pepper Chicken Waffle', 'price': 180}],
'name': 'Waffles(Savoury)'}]
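An equivalent, more compact variant of the loop above, using a list comprehension over the same d (same output, shown only as an alternative):

menu = [
    {**sub, 'name': "{}({})".format(d['name'], sub['name'])}
    for sub in d.get('subCategories', [])
]
pprint.pprint(menu)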
This is the code to do what you ask. Note that it can be optimized, but was left as is for readability.
source = ...  # your dictionary from the question

# list to hold your new dictionaries (if there are more than one)
dict_list = []

# loop to extract one flat dict per sub-category
for item in source['subCategories']:
    d = {}
    sub_name = item['name']
    # this could be made more compact
    d.update({'name': 'Waffles(' + sub_name + ')'})
    d.update({'menu': item['menu']})
    d.update({'description': item['description']})
    dict_list.append(d)

# print new dictionaries
for i in dict_list:
    print(i)
Note that the program could be made more robust to handle arbitrary YAML (or JSON, I don't know what format this dictionary came from) if needed. Here the keys used for extraction are hard-coded.
Best of luck!
PS: there was a formatting error in your starting dictionary (a missing comma after 'name': 'Savoury').
I have these models:
User:
    email = EmailField()

Payment:
    user = ForeignKey(User)
    sum = DecimalField()

GuestAccount:
    user = ForeignKey(User)
    guest = ForeignKey(User)
I want to get each user's email, the amount of money that came from that user, and the number of their guest accounts.
My query:
User.objects.annotate(
    money=Sum('payment__sum'),
    guests_number=Count('guestaccount')
).values('email', 'money', 'guests_number')
But money and guests_number in the result of the query are bigger than they really are:
{'guests_number': 0, 'email': 'a#b.cd', 'money': None}
{'guests_number': 20, 'email': 'user1#mail.com', 'money': Decimal('6600.00')}
{'guests_number': 4, 'email': 'user1000#test.com', 'money': Decimal('2500.00')}
{'guests_number': 0, 'email': 'zzzz#bbbbb.com', 'money': None}
I noticed that I get correct data if I split the query into 2 separate queries:
User.objects.annotate(money=Sum('payment__sum')).values('email', 'money')
User.objects.annotate(guests_number=Count('guestaccount')).values('email', 'guests_number')
Correct result of 1st half:
{'email': 'a#b.cd', 'money': None}
{'email': 'user1#mail.com', 'money': Decimal('1650.00')}
{'email': 'user1000#test.com', 'money': Decimal('1250.00')}
{'email': 'zzzz#bbbbb.com', 'money': None}
Correct result of 2nd half:
{'email': 'a#b.cd', 'guests_number': 0}
{'email': 'user1#mail.com', 'guests_number': 4}
{'email': 'user1000#test.com', 'guests_number': 2}
{'email': 'zzzz#bbbbb.com', 'guests_number': 0}
I also noticed that I can add distinct=True to the Count aggregation:
User.objects.annotate(
    money=Sum('payment__sum'),
    guests_number=Count('guestaccount', distinct=True)
).values('email', 'money', 'guests_number')
It fixes guests_number:
{'guests_number': 0, 'email': 'a#b.cd', 'money': None}
{'guests_number': 4, 'email': 'user1#mail.com', 'money': Decimal('6600.00')}
{'guests_number': 2, 'email': 'user1000#test.com', 'money': Decimal('2500.00')}
{'guests_number': 0, 'email': 'zzzz#bbbbb.com', 'money': None}
Unfortunately, there is no distinct parameter for the Sum aggregation.
What is wrong with my query? How do I stop these numbers from getting bigger with every aggregation in annotate?
Investigating the raw SQL showed that the problem comes from the multiple LEFT OUTER JOINs. So I ended up with raw SQL:
User.objects.extra(select={
    "money": """
        SELECT SUM("website_payment"."sum")
        FROM "website_payment"
        WHERE "website_user"."id" = "website_payment"."user_id"
    """,
    "guests_number": """
        SELECT COUNT("guests_guestaccount"."id")
        FROM "guests_guestaccount"
        WHERE "website_user"."id" = "guests_guestaccount"."user_id"
    """,
}).values('email', 'money', 'guests_number')
But I need to annotate these fields onto the queried objects, and extra doesn't do it.
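For reference, a sketch of the same two aggregates written as correlated subqueries with Subquery/OuterRef (available since Django 1.11), which avoids the multiplying LEFT OUTER JOINs and does annotate the values onto the queried User objects; the model and field names are taken from the question:

from django.db.models import Count, OuterRef, Subquery, Sum

# one row per user: total of that user's payments
money_sq = (Payment.objects.filter(user=OuterRef('pk'))
            .values('user')
            .annotate(total=Sum('sum'))
            .values('total'))

# one row per user: number of that user's guest accounts
guests_sq = (GuestAccount.objects.filter(user=OuterRef('pk'))
             .values('user')
             .annotate(n=Count('id'))
             .values('n'))

users = User.objects.annotate(
    money=Subquery(money_sq),
    guests_number=Subquery(guests_sq),
)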