Create and insert into database tables using luigi (python-2.7)

I am trying to understand the correct way to drop and recreate a table and insert data into the newly created table using luigi. I have multiple CSV files, passed from the prior task to this task, that should be inserted into the database.
My current code looks like this.
class CreateTables(sqla.CopyToTable):
    connection_string = DatabaseConfig().data_mart_connection_string
    table = DatabaseConfig().table_name

    def requires(self):
        return CustomerJourneyToCSV()

    def output(self):
        return SQLAlchemyTarget(
            connection_string=self.connection_string,
            target_table="customerJourney_1",
            update_id=self.update_id(),
            connect_args=self.connect_args,
            echo=self.echo)

    def create_table(self, engine):
        base = automap_base()
        Session = sessionmaker(bind=engine)
        session = Session()
        metadata = MetaData(engine)
        base.prepare(engine, reflect=True)

        # Drop existing tables
        for i in range(1, len(self.input()) + 1):
            for t in base.metadata.sorted_tables:
                if t.name in "{0}_{1}".format(self.table, i):
                    t.drop(engine)

        # Create new tables and insert data
        i = 1
        for f in self.input():
            df = pd.read_csv(f.path, sep="|")
            df.fillna(value="", inplace=True)
            ts = define_table_schema(df)
            t = Table("{0}_{1}".format(self.table, i), metadata,
                      *[Column(*c[0], **c[1]) for c in ts])
            t.create(engine)
            # TODO: Need to remove head and figure out how to stop the connection from timing out
            my_insert = t.insert().values(df.head(500).to_dict(orient="records"))
            session.execute(my_insert)
            i += 1
        session.commit()
The code works: it creates the tables and inserts the data, but then falls over with the following error.
C:\Users\simon\AdviceDataMart\lib\site-packages\luigi\worker.py:191: DtypeWarning: Columns (150) have mixed types. Specify dtype option on import or set low_memory=False.
  new_deps = self._run_get_new_deps()
  File "C:\Users\simon\AdviceDataMart\lib\site-packages\luigi\worker.py", line 191, in run
    new_deps = self._run_get_new_deps()
  File "C:\Users\simon\AdviceDataMart\lib\site-packages\luigi\worker.py", line 129, in _run_get_new_deps
    task_gen = self.task.run()
  File "C:\Users\simon\AdviceDataMart\lib\site-packages\luigi\contrib\sqla.py", line 375, in run
    for row in itertools.islice(rows, self.chunk_size)]
  File "C:\Users\simon\AdviceDataMart\lib\site-packages\luigi\contrib\sqla.py", line 363, in rows
    with self.input().open('r') as fobj:
AttributeError: 'list' object has no attribute 'open'
I am not sure what is causing this, and I am not able to easily debug a luigi pipeline. Is this to do with my implementation of the run method or the output method?
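For what it is worth, the traceback points at the stock rows() method in luigi/contrib/sqla.py calling self.input().open('r'), which assumes a single upstream target, whereas here self.input() is a list of CSV targets. Below is a minimal sketch of overriding rows() on the task to walk those targets; it assumes pipe-separated upstream files and only addresses the AttributeError, not the multiple-table logic in create_table.

def rows(self):
    # Sketch only: iterate every upstream CSV target instead of assuming one.
    for target in self.input():
        with target.open('r') as fobj:
            for line in fobj:
                # Each yielded row should match the columns of the configured table.
                yield line.rstrip('\n').split('|')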

Related

Read custom input file (ldif) type/format in GCP Dataflow

I have an ldif extension file from an LDAP system. I can easily parse it in Python, extract the required data from the file, and insert it into SQL Server. My sample Python looks like this:
import os
from ldif3 import LDIFParser
import pymssql

parser = LDIFParser(open('temp.ldiff', 'rb'))

def return_dictionary_element_if_present(dict_entry, element):
    if dict_entry.get(element):
        return dict_entry.get(element)[0]
    return ''

def add_new_user():
    for dn, entry in parser.parse():
        dict_entry = dict(entry)
        email = return_dictionary_element_if_present(dict_entry, 'email')
        password = return_dictionary_element_if_present(dict_entry, 'password')
        # some code to insert into SQL server

add_new_user()
But when I try to convert this to Dataflow, I cannot work out what to modify and where. My Dataflow code looks something like this:
class sqlserverwriteDoFn(beam.DoFn):
    # insert statement

class CreateEntities(beam.DoFn):
    def process(self, element):
        # figure out how to return dictionary if parsed correctly
        return [{"email": email, "password": password}]

def dataflow(input_file, pipeline_options):
    print("starting")
    options = GoogleCloudOptions.from_dictionary(pipeline_options)
    with beam.Pipeline(options=options) as p:
        (p | 'Reading Ldif data from GCS' >> beam.io.ReadFromText(input_file)
           | 'Create entities' >> beam.ParDo(CreateEntities())
           | 'Insert data to SQLSERVER' >> beam.ParDo(sqlserverwriteDoFn(pipeline_options['project']))
        )
I think ReadFromText turns each line of the file into an element of the PCollection, which in my case does not work. A sample ldif file looks like this:
dn: uid=12345,ab=users,xy=random
phone: 111
address: someaddress
email: true
username:abc
password:abc

dn: uid=12345,ab=users,xy=random
objectClass: inetOrgPerson
objectClass: top
phone: 111
address: someaddress
email: true
username:abcd
password:abcd
Any ideas are really appreciated, as I am looking to import 50 million usernames and passwords from the LDIF file, and a simple Python for loop definitely cannot scale to that.
[Edit1] As per the comments, I modified the code and am now getting a different error.
def return_dictionary_element_if_present(dict_entry, element):
    if dict_entry.get(element):
        return dict_entry.get(element)[0]
    return ''

class CreateEntities(beam.DoFn):
    def process(self, file):
        parser = LDIFParser(open(file, 'rb'))
        arr = []
        for dn, entry in parser.parse():
            dict1 = {}
            dict_entry = dict(entry)
            email = return_dictionary_element_if_present(dict_entry, 'email')
            password = return_dictionary_element_if_present(dict_entry, 'password')
            dict1['email'] = email
            dict1['password'] = password
            arr.append(dict1)
        return arr

def dataflow(pipeline_options):
    print("starting")
    options = GoogleCloudOptions.from_dictionary(pipeline_options)
    with beam.Pipeline(options=options) as p:
        (p | 'Reading data from GCS' >> MatchFiles(file_pattern="temp.ldiff")
           | 'file match' >> ReadMatches()
           | 'Create entities' >> beam.ParDo(CreateEntities())
           | 'print to screen' >> beam.Map(print)
        )
I am getting the following error:
File "dataflow.py", line 26, in process
parser = LDIFParser(open(file, 'rb'))
TypeError: expected str, bytes or os.PathLike object, not ReadableFile [while running 'Create entities']
[Edit2] I changed one line of the Python code as below:
parser = LDIFParser(file)
I got this error:
File "dataflow.py", line 28, in process
for dn, entry in parser.parse():
File "C:\Users\sande\anaconda3\envs\saopaulo\lib\site-packages\ldif3.py", line 383, in parse
for block in self._iter_blocks():
File "C:\Users\sande\anaconda3\envs\saopaulo\lib\site-packages\ldif3.py", line 282, in _iter_blocks
for line in self._iter_unfolded_lines():
File "C:\Users\sande\anaconda3\envs\saopaulo\lib\site-packages\ldif3.py", line 263, in _iter_unfolded_lines
line = self._input_file.readline()
AttributeError: 'ReadableFile' object has no attribute 'readline' [while running 'Create entities']
How should I change my code so that the error is resolved?
You are correct, TextIO in the Python SDK uses newline as the delimiter to separate elements. So each element produced is a single line of the input file.
In your original code you already have a parser that can read LDIF files. You can use that in your pipeline via a ParDo transform. I would recommend beginning with FileIO to create a PCollection of LDIF files, and then using those as input to your own ParDo, which parses those files and outputs your records. Note that you will likely want to read up on managing Beam Python dependencies if you want to use the existing parser on Dataflow, as your Dataflow workers will need access to that dependency.
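A minimal sketch of that shape, assuming the handle returned by ReadableFile.open() exposes readline() (which ldif3's parser needs) and using a hypothetical bucket pattern:

import apache_beam as beam
from apache_beam.io import fileio
from ldif3 import LDIFParser

class ParseLdif(beam.DoFn):
    def process(self, readable_file):
        # readable_file is a fileio.ReadableFile; open() returns a file handle
        # that LDIFParser can read from (assumption: it supports readline()).
        with readable_file.open() as handle:
            parser = LDIFParser(handle)
            for dn, entry in parser.parse():
                entry = dict(entry)
                # Yield one record per LDIF entry instead of building a list in memory.
                yield {
                    'email': entry.get('email', [''])[0],
                    'password': entry.get('password', [''])[0],
                }

with beam.Pipeline() as p:
    (p
     | 'Match LDIF files' >> fileio.MatchFiles('gs://my-bucket/*.ldiff')  # hypothetical pattern
     | 'Read matches' >> fileio.ReadMatches()
     | 'Parse LDIF' >> beam.ParDo(ParseLdif())
     | 'Print to screen' >> beam.Map(print))

Yielding one dictionary per entry, rather than returning a list, keeps memory flat when a single LDIF file holds millions of records.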

Batch SQL INSERT in Django App

I am following this example to batch insert records into a table, but am modifying it to fit my specific case as follows:
sql = 'INSERT INTO CypressApp_grammatrix (name, row_num, col_num, gram_amount) VALUES {}'.format(
    ', '.join(['(%s, %s, %s, %s)'] * len(gram_matrix)))
# print sql

params = []
for gram in gram_matrix:
    col_num = 1
    for g in gram:
        params.extend([(matrix_name, row_num, col_num, g)])
        col_num += 1
    row_num += 1
print params

with closing(connection.cursor()) as cursor:
    cursor.execute(sql, params)
However, upon doing so, I receive this error
return cursor._last_executed.decode('utf-8')
File "/usr/local/lib/python2.7/dist-packages/django/db/backends/mysql/base.py", line 150, in __getattr__
return getattr(self.cursor, attr)
AttributeError: 'Cursor' object has no attribute '_last_executed'
I would like to know why I received this error and what I can do to fix it, although I feel the problem could be with this code (which works with MySQL and which I did not write):
def last_executed_query(self, cursor, sql, params):
    # With MySQLdb, cursor objects have an (undocumented) "_last_executed"
    # attribute where the exact query sent to the database is saved.
    # See MySQLdb/cursors.py in the source distribution.
    return cursor._last_executed.decode('utf-8')
So I don't know if I simply have an old copy of MySQLdb or what, but the problem appears to be with cursors.py. The only spot in that file where you can find _last_executed is here:
def _do_query(self, q):
    db = self._get_db()
    self._last_executed = q
    db.query(q)
    self._do_get_result()
    return self.rowcount
However, the __init__ does not set up this variable as an instance attribute. It's missing completely. So I took the liberty of adding it myself and initializing it to some query string. I assumed any would do, so I just added
class BaseCursor(object):
    """A base for Cursor classes. Useful attributes:

    description
        A tuple of DB API 7-tuples describing the columns in
        the last executed query; see PEP-249 for details.

    description_flags
        Tuple of column flags for last query, one entry per column
        in the result set. Values correspond to those in
        MySQLdb.constants.FLAG. See MySQL documentation (C API)
        for more information. Non-standard extension.

    arraysize
        default number of rows fetchmany() will fetch
    """

    from _mysql_exceptions import MySQLError, Warning, Error, InterfaceError, \
        DatabaseError, DataError, OperationalError, IntegrityError, \
        InternalError, ProgrammingError, NotSupportedError

    def __init__(self, connection):
        from weakref import ref
        ...
        self._last_executed = "SELECT * FROM T"
        ...
Now the cursor object does have the attribute _last_executed and when this function
def last_executed_query(self, cursor, sql, params):
    # With MySQLdb, cursor objects have an (undocumented) "_last_executed"
    # attribute where the exact query sent to the database is saved.
    # See MySQLdb/cursors.py in the source distribution.
    return cursor._last_executed.decode('utf-8')
in base.py is called, the attribute does exist and so this error
return cursor._last_executed.decode('utf-8')
  File "/usr/local/lib/python2.7/dist-packages/django/db/backends/mysql/base.py", line 150, in __getattr__
    return getattr(self.cursor, attr)
AttributeError: 'Cursor' object has no attribute '_last_executed'
will not be encountered. At least that is how I believe it works. In any case, it fixed the situation for me.
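Separately from the patch above, one possible source of the underlying error in the original snippet is worth noting as an observation rather than the accepted fix: cursor.execute() expects one parameter per %s placeholder, so params should be a flat sequence of values rather than a list of 4-tuples. A sketch of the flattened version (row_num is assumed to be initialized earlier, as in the original):

sql = 'INSERT INTO CypressApp_grammatrix (name, row_num, col_num, gram_amount) VALUES {}'.format(
    ', '.join(['(%s, %s, %s, %s)'] * len(gram_matrix)))

params = []
for gram in gram_matrix:
    col_num = 1
    for g in gram:
        # Flat values, one per placeholder, instead of nested tuples.
        params.extend([matrix_name, row_num, col_num, g])
        col_num += 1
    row_num += 1

with closing(connection.cursor()) as cursor:
    cursor.execute(sql, params)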

Exception when trying to create bigquery table via python API

I'm working on an app that will stream events into BQ. Since Streamed Inserts require the table to pre-exist, I'm running the following code to check if the table exists, and then to create it if it doesn't:
TABLE_ID = "data" + single_date.strftime("%Y%m%d")
exists = False

request = bigquery.tables().list(projectId=PROJECT_ID,
                                 datasetId=DATASET_ID)
response = request.execute()

while response is not None:
    for t in response.get('tables', []):
        if t['tableReference']['tableId'] == TABLE_ID:
            exists = True
            break
    request = bigquery.tables().list_next(request, response)
    if request is None:
        break

if not exists:
    print("Creating Table " + TABLE_ID)
    dataset_ref = {'datasetId': DATASET_ID,
                   'projectId': PROJECT_ID}
    table_ref = {'tableId': TABLE_ID,
                 'datasetId': DATASET_ID,
                 'projectId': PROJECT_ID}
    schema_ref = SCHEMA
    table = {'tableReference': table_ref,
             'schema': schema_ref}
    table = bigquery.tables().insert(body=table, **dataset_ref).execute(http)
I'm running Python 2.7 and have installed the Google client API through pip.
When I try to run the script, I get the following error:
No handlers could be found for logger "oauth2client.util"
Traceback (most recent call last):
File "do_hourly.py", line 158, in <module>
main()
File "do_hourly.py", line 101, in main
body=table, **dataset_ref).execute(http)
File "build/bdist.linux-x86_64/egg/oauth2client/util.py", line 142, in positional_wrapper
File "/usr/lib/python2.7/site-packages/googleapiclient/http.py", line 721, in execute
resp, content = http.request(str(self.uri), method=str(self.method),
AttributeError: 'module' object has no attribute 'request'
I tried researching the issue, but all I could find was information about confusion between urllib, urllib2, and Python 2.7 / 3.
I'm not quite sure how to continue with this, and will appreciate all help.
Thanks!
Figured out that the issue was in the following line, which I took from another SO thread:
table = bigquery.tables().insert(body=table, **dataset_ref).execute(http)
Once I removed the "http" variable, which doesn't exist in my scope, the exception disappeared.
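For reference, a sketch of the working call after that change, assuming (as described above) that the bigquery service object was already built with authorized credentials, so execute() needs no explicit http argument:

# execute() with no argument reuses the http the service object was built with.
table = bigquery.tables().insert(body=table, **dataset_ref).execute()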

What is wrong with Django csv upload code?

Here is my code. I would like to import a CSV file and save it to the database via a model.
class DataInput(forms.Form):
    file = forms.FileField(label="Select CSV file")

    def save(self, mdl):
        records = csv.reader(self.cleaned_data["file"].read().decode('utf-8'), delimiter=',')
        if mdl == 'auction':
            auction = Auction()
            for line in records:
                auction.auction_name = line[0]
                auction.auction_full_name = line[1]
                auction.auction_url = line[2]
                auction.is_group = line[3]
                auction.save()
Now, it throws the following error.
Exception Type: IndexError
Exception Value: list index out of range
The csv file:
RTS,Rapid Trans System,www.rts.com,TRUE
ZAA,Zelon Advanced Auton,www.zaa.info,FALSE
I'm really stuck. Please help.
First of all, the full stack trace should reveal exactly where the error is. Give Django the --traceback argument, e.g. ./manage.py --traceback runserver.
As Burhan Khalid mentioned in the comments, you are missing the 5th column in your csv file (index 4), so that is the root of the error.
Once you read the file with .read(), you are passing in the complete string - which is why each row is an individual character.
You need to pass the entire file object, without reading it first:
records = csv.reader(self.cleaned_data["file"], delimiter=',')
If you need to decode it first, then you had better run through the file yourself:
for line in self.cleaned_data['file'].read().decode('utf-8').split('\n'):
    if line.strip():
        try:
            name, full_name, url, group = line.split(',')
        except ValueError:
            print('Invalid line: {}'.format(line))
            continue
        i = Auction()
        i.auction_name = name
        i.auction_full_name = full_name
        i.auction_url = url
        i.is_group = group
        i.save()
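If the upload needs decoding and may also contain quoted fields or embedded commas, a variation on the same idea is to decode once and hand the text to csv.reader via io.StringIO. This is a sketch assuming Python 3 text handling; on Python 2 the csv module expects byte strings, so the decode step would be dropped.

import csv
import io

def save(self, mdl):
    decoded = self.cleaned_data["file"].read().decode('utf-8')
    records = csv.reader(io.StringIO(decoded), delimiter=',')
    if mdl == 'auction':
        for row in records:
            if len(row) != 4:
                continue  # skip blank or malformed lines
            name, full_name, url, group = row
            # A fresh instance per row, so every row becomes its own record.
            Auction.objects.create(
                auction_name=name,
                auction_full_name=full_name,
                auction_url=url,
                is_group=group,
            )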

cannot read tables from sqlite3 database attached in python

I can connect to a database in sqlite3, attach another database and run an inner join to retrieve records from two tables, one in each database. But when I try to do the same with a python script running on the command line, I get no results - the error reads that the table (in the attached database) does not exist.
import sqlite3 as lite

db_acts = '/full/path/to/activities.db'
db_sign = '/full/path/to/sign_up.db'

def join_tables():
    try:
        con = lite.connect(db_acts)
        cursor = con.cursor()
        cursor.execute("attach database 'db_sign' as 'sign_up'")
        cursor.execute("select users.ID, users.Email, users.TextMsg from sign_up.users INNER JOIN db_acts.alerts on sign_up.users.ID = db_acts.alerts.UID")
        rows = cursor.fetchall()
        for row in rows:
            print 'row', row
        con.commit()
        con.close()
    except lite.Error, e:
        print 'some error'
        sys.exit(1)
The response on localhost is the same as on the HostGator remote host where I just ran a test (it's a new site without user inputs at the moment). I have no problem reading rows from tables in the original database connection - only the tables in the attached database are not read. The attachment works at least partially - a print statement to attach it in the except clause shows that the database is in use.
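Without presuming the exact cause, two details in the snippet stand out: the ATTACH statement passes the literal string 'db_sign' rather than the path held in the db_sign variable, and the join qualifies alerts with db_acts., which is a Python variable name rather than a schema alias (tables on the main connection live under main). A sketch of how the attach and join would usually be written under that reading:

import sqlite3 as lite
import sys

db_acts = '/full/path/to/activities.db'
db_sign = '/full/path/to/sign_up.db'

def join_tables():
    try:
        con = lite.connect(db_acts)
        cursor = con.cursor()
        # Bind the real path and give the attached database an alias.
        cursor.execute("ATTACH DATABASE ? AS sign_up", (db_sign,))
        # Tables of the main connection are addressed as main.<table> (or bare).
        cursor.execute(
            "SELECT users.ID, users.Email, users.TextMsg "
            "FROM sign_up.users "
            "INNER JOIN main.alerts ON sign_up.users.ID = main.alerts.UID")
        for row in cursor.fetchall():
            print 'row', row
        con.close()
    except lite.Error, e:
        print 'attach or query failed:', e
        sys.exit(1)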