Importing an xls file (more than 5,000 rows) into my SQLite database takes a very long time.
# Django view that reads the uploaded workbook's "Sheet 1" and, for every row
# from the second one onwards, saves the related records and one
# DerangementAdsl record. Every .save() / .objects.get() below is a separate
# database query executed per row.
def importeradsl(request):
# NOTE(review): the body of the GET branch is not shown in this listing.
if "GET" == request.method:
else:
excel_file = request.FILES["excel_file"]
#you may put validations here to check extension or file size
wb = openpyxl.load_workbook(excel_file)
#getting a particular sheet by name out of many sheets
worksheet = wb["Sheet 1"]
#iterating over the rows and getting value from each cell in row
# min_row=2 starts at the second row (presumably skipping a header row -- confirm).
for row in worksheet.iter_rows(min_row=2):
row_data = list()
for cell in row:
# Every cell value is converted to str, including empty cells.
row_data.append(str(cell.value))
# Get the content of the fields for the DerangementCuivre models
# Client
nd = row_data[0]
nom_client = row_data[3]
nd_contact = row_data[4]
# Category
code_categorie = row_data[6]
acces_reseau = row_data[8]
etat = row_data[9]
origine = row_data[10]
code_sig = row_data[11]
agent_sig = row_data[13]
# Dates are re-parsed and formatted as 'YYYY-MM-DD HH:MM:SS' strings.
date_sig = dt.datetime.strftime(parse(row_data[14]), '%Y-%m-%d %H:%M:%S')
date_essai = dt.datetime.strftime(parse(row_data[15]), '%Y-%m-%d %H:%M:%S')
agent_essai = row_data[18]
# NOTE(review): if strptime raises ValueError, date_ori appears to stay
# unassigned for this row; on success the else branch overwrites it with
# the formatted string -- confirm the intended indentation.
try:
date_ori = dt.datetime.strptime(row_data[19], '%Y-%m-%d %H:%M:%S')
except ValueError as e:
print ("Vous", e)
else:
date_ori = dt.datetime.strftime(parse(row_data[19]), '%Y-%m-%d %H:%M:%S')
agent_ori = row_data[20]
code_ui = row_data[21]
equipe = row_data[22]
sous_traitant = row_data[23]
date_pla = dt.datetime.strftime(parse(row_data[24]), '%Y-%m-%d %H:%M:%S')
date_rel = dt.datetime.strftime(parse(row_data[25]), '%Y-%m-%d %H:%M:%S')
# Columns 25 and 15 are parsed a second time as datetime objects for the
# delay computation below.
date_releve = dt.datetime.strptime(row_data[25], '%Y-%m-%d %H:%M:%S')
date_essais = dt.datetime.strptime(row_data[15], '%Y-%m-%d %H:%M:%S')
# Both datetimes are localized to Africa/Dakar and then converted to UTC.
pst = pytz.timezone('Africa/Dakar')
date_releve = pst.localize(date_releve)
utc = pytz.UTC
date_releve = date_releve.astimezone(utc)
date_essais = pst.localize(date_essais)
date_essais = date_essais.astimezone(utc)
code_rel = row_data[26]
localisation = row_data[27]
cause = row_data[28]
commentaire = row_data[29]
agent_releve = row_data[30]
centre_racc = row_data[32]
rep = row_data[33]
srp = row_data[34]
# delai is the gap in seconds between date_releve and date_essais;
# dali is that gap in whole days.
delai = (date_releve - date_essais).total_seconds()
dali = divmod(delai, 86400)[0]
semaine = date_releve.isocalendar()[1]
mois = date_releve.month
annee = date_releve.year
# More than 7 whole days is flagged "PEX PLUS", otherwise "PEX".
if dali > 7:
etats = "PEX PLUS"
else:
etats = "PEX"
# Save a client
Client(nd=nd, nom=nom_client, mobile=nd_contact).save()
# Save the category
# Code for the category name - to be filled in later
Categorie(code_categorie=code_categorie, nom="Public").save()
# Save the reporting (signalisation) agent
AgentSig(matricule=agent_sig, nom="Awa").save()
# Save the testing (essai) agent
AgentEssai(matricule=agent_essai).save()
# Save the dispatching (orientation) agent
AgentOri(matricule=agent_ori).save()
# Save the resolving (releve) agent
AgentRel(matricule=agent_releve).save()
# Save the subcontractor
SousTraitant(nom=sous_traitant).save()
# Save the centre
Centre(code=centre_racc).save()
# Save the intervention unit (the subcontractor just saved is fetched again)
UniteIntervention(code_ui=code_ui,
sous_traitant=SousTraitant.objects.get(nom=sous_traitant)).save()
# Save the distribution frame (repartiteur)
Repartiteur(code=rep, crac=Centre.objects.get(code=centre_racc)).save()
# Save the team
Equipe(nom=equipe, unite=UniteIntervention.objects.get(code_ui=code_ui)).save()
# Save the sub-distribution frame (SR)
SousRepartiteur(code=srp, rep=Repartiteur.objects.get(code=rep)).save()
# Save the fault record (derangement); each related record saved above is
# fetched back with objects.get().
# NOTE(review): categorie is given a new, unsaved Categorie instance here
# rather than the record saved above -- confirm this is intended.
DerangementAdsl(acces_reseau=acces_reseau,
nd_client=Client.objects.get(nd=nd),
categorie=Categorie(code_categorie=code_categorie),
etat=etat,
origine=origine,
code_sig=code_sig,
agent_sig=AgentSig.objects.get(matricule=agent_sig),
date_sig=date_sig,
date_essai=date_essai,
agent_essai=AgentEssai.objects.get(matricule=agent_essai),
date_ori=date_ori,
agent_ori=AgentOri.objects.get(matricule=agent_ori),
sous_traitant=SousTraitant.objects.get(nom=sous_traitant),
unite_int = UniteIntervention.objects.get(code_ui=code_ui),
date_pla=date_pla,
date_rel=date_rel,
code_rel=code_rel,
code_local=localisation,
cause=cause,
comment_cause=commentaire,
agent_rel=AgentRel.objects.get(matricule=agent_releve),
centre=Centre.objects.get(code=centre_racc),
rep=Repartiteur.objects.get(code=rep),
srep=SousRepartiteur.objects.get(code=srp),
delai=dali,
etat_vr=etats,
semaine=semaine,
mois=mois,
annee=annee).save()
There are a few things that are incorrect.
I propose to you the following approach:
Make your code more readable
Remove useless queries
Avoid related records duplication
Cache out your related instances.
Use bulk_create
Looking at your code, at a rough estimate you run over 30 SQL queries per spreadsheet row, which is a lot...
1. Make your code more readable.
Your parsing logic can be DRYed, a lot.
First, identify what you do with your data.
From my point of view, 2 main functions:
Do nothing:
def no_transformation(value):
    """Return ``value`` converted to ``str``, with no other processing."""
    return str(value)
Parse dates
def strptime(value):
    """Normalise a date cell to a ``'%Y-%m-%d %H:%M:%S'`` string.

    ``parse`` is the date parser the original view already calls; it is not
    defined in this snippet (presumably ``dateutil.parser.parse`` -- confirm).
    If you have custom date-parsing logic, this is the place to add it.

    NOTE(review): the row helper in this answer is also named ``parse``; one
    of the two must be renamed or aliased so they do not shadow each other.
    """
    # The original view formats the parsed date with strftime. strptime takes
    # a string as its first argument, so it cannot be given the parsed date.
    return dt.datetime.strftime(parse(str(value)), '%Y-%m-%d %H:%M:%S')
Now, you can declare your parser configuration:
# One entry per value to extract from a row:
# (column_index, variable_name, transformation_function)
# Entries are listed in column order; a column may feed several names.
PARSER_CONFIG = (
    (0, 'nd', no_transformation),
    (3, 'nom_client', no_transformation),
    (4, 'nd_contact', no_transformation),
    (6, 'code_categorie', no_transformation),
    (8, 'acces_reseau', no_transformation),
    (9, 'etat', no_transformation),
    (10, 'origine', no_transformation),
    (11, 'code_sig', no_transformation),
    (13, 'agent_sig', no_transformation),
    (14, 'date_sig', strptime),
    (15, 'date_essai', strptime),
    (15, 'date_essais', strptime),
    (18, 'agent_essai', no_transformation),
    # 'date_ori' was listed twice with the same column and function;
    # one entry is enough.
    (19, 'date_ori', strptime),
    (20, 'agent_ori', no_transformation),
    (21, 'code_ui', no_transformation),
    (22, 'equipe', no_transformation),
    (23, 'sous_traitant', no_transformation),
    (24, 'date_pla', strptime),
    (25, 'date_rel', strptime),
    (25, 'date_releve', strptime),
    (26, 'code_rel', no_transformation),
    (27, 'localisation', no_transformation),
    (28, 'cause', no_transformation),
    (29, 'commentaire', no_transformation),
    (30, 'agent_releve', no_transformation),
    (32, 'centre_racc', no_transformation),
    (33, 'rep', no_transformation),
    (34, 'srp', no_transformation),
)
Now, you know how to parse your data, and how to name it.
Let just put that stuff into a dict.
def parse(row):
    """Transform a row into a dict.

    Args:
        row (tuple): Your row's data, indexable by column position.

    Returns:
        dict: Your parsed data, keyed by the names declared in
        ``PARSER_CONFIG``; each value has been passed through the entry's
        transformation function.
    """
    return {
        key: transform(row[index]) for index, key, transform in PARSER_CONFIG
    }
From here, your parser is way more readable, you know exactly what you're doing with your data.
Wrapping this up all together, you should get:
PARSER_CONFIG=(
#(column_index, variable_name, transformation_function)
#...
)
def no_transformation(value):
    """Return ``value`` converted to ``str``, with no other processing."""
    return str(value)
def strptime(value):
    """Normalise a date cell to a ``'%Y-%m-%d %H:%M:%S'`` string.

    ``parse`` is the date parser the original view already calls (presumably
    ``dateutil.parser.parse`` -- confirm). NOTE(review): it must not be
    shadowed by the row helper of the same name defined in this listing.
    """
    # Same formatting as the original view applies to its date columns.
    return dt.datetime.strftime(parse(str(value)), '%Y-%m-%d %H:%M:%S')
def parse(row):
    """Transform a row into a dict.

    Args:
        row (tuple): Your row's data, indexable by column position.

    Returns:
        dict: Your parsed data, keyed by the names declared in
        ``PARSER_CONFIG``.
    """
    return {
        key: transform(row[index]) for index, key, transform in PARSER_CONFIG
    }
for row in rows:
    item = parse(row)  # < Your data, without related instances yet....
Still have some work to create your related instances, but we'll get there eventually.
2. Removing useless queries.
You do :
#...First, your create a record
Client(nd=nd, nom=nom_client, mobile=nd_contact).save()
#... Then you fetch it when saving DerangementAdsl
nd_client=Client.objects.get(nd=nd)
While a more pythonic way of doing this would be:
#... You create and assign your instance in one step.
# Model.save() returns None, so "Client(...).save()" would leave `client`
# set to None; objects.create() saves the row and returns the instance.
client = Client.objects.create(
    nd=item.get('nd'),
    nom=item.get('nom_client'),
    mobile=item.get('nd_contact'),
)
#...
# Passed as a keyword argument when building DerangementAdsl:
nd_client=client
You just saved one SQL query per row! Apply the same logic to each model and you'll save around 20 queries per row!
# Each line is a plain assignment (no trailing comma), so the name holds the
# created instance itself and can be reused by the lines that follow.
categorie = Categorie.objects.create(code_categorie=item.get('code_categorie'), nom="Public")
# Save the reporting (signalisation) agent
agent_sig = AgentSig.objects.create(matricule=item.get('agent_sig'), nom="Awa")
# Save the testing (essai) agent
agent_essai = AgentEssai.objects.create(matricule=item.get('agent_essai'))
# Save the dispatching (orientation) agent
agent_ori = AgentOri.objects.create(matricule=item.get('agent_ori'))
# Save the resolving (releve) agent
agent_rel = AgentRel.objects.create(matricule=item.get('agent_releve'))
# Save the subcontractor
sous_traitant = SousTraitant.objects.create(nom=item.get('sous_traitant'))
# Save the centre
centre = Centre.objects.create(code=item.get('centre_racc'))
# Save the intervention unit
unite_int = UniteIntervention.objects.create(code_ui=item.get('code_ui'), sous_traitant=sous_traitant)  # < You save one extra query with sous_traitant
# Save the distribution frame (repartiteur)
rep = Repartiteur.objects.create(code=item.get('rep'), crac=centre)  # < You save one extra query with centre
# Save the team
equipe = Equipe.objects.create(nom=item.get('equipe'), unite=unite_int)  # < You save one extra query with unite_int
# Save the sub-distribution frame (SR)
srep = SousRepartiteur.objects.create(code=item.get('srp'), rep=rep)  # < You save one extra query with rep
3. Avoid related records duplication
Now there is one big issue:
Considering you have multiple rows for each client,
you'll eventually find yourself with many duplicates, and you do not want that.
Instead of using create, you should go with get_or_create.
Please note it returns a tuple: (instance, created)
So.... your code should go like:
# get_or_create returns an (instance, created) pair. There is no trailing
# comma after the calls: a trailing comma would wrap the pair in a one-element
# tuple and break the two-name unpacking.
categorie, categorie_created = Categorie.objects.get_or_create(code_categorie=item.get('code_categorie'), nom="Public")
agent_sig, agent_sig_created = AgentSig.objects.get_or_create(matricule=item.get('agent_sig'), nom="Awa")
agent_essai, agent_essai_created = AgentEssai.objects.get_or_create(matricule=item.get('agent_essai'))
agent_ori, agent_ori_created = AgentOri.objects.get_or_create(matricule=item.get('agent_ori'))
agent_rel, agent_rel_created = AgentRel.objects.get_or_create(matricule=item.get('agent_releve'))
sous_traitant, sous_traitant_created = SousTraitant.objects.get_or_create(nom=item.get('sous_traitant'))
centre, centre_created = Centre.objects.get_or_create(code=item.get('centre_racc'))
# The lookups below reuse the parent instances obtained above.
unite_int, unite_int_created = UniteIntervention.objects.get_or_create(code_ui=item.get('code_ui'), sous_traitant=sous_traitant)
rep, rep_created = Repartiteur.objects.get_or_create(code=item.get('rep'), crac=centre)
equipe, equipe_created = Equipe.objects.get_or_create(nom=item.get('equipe'), unite=unite_int)
srep, srep_created = SousRepartiteur.objects.get_or_create(code=item.get('srp'), rep=rep)
Tadaaaaam, you'll create records that are "only" necessary for your related objects.
4. Caching out your related objects.
As in previous topic, I consider you have multiple rows for each related instance,
and for each row, you will still get to fetch that from your DB.
It's OK I guess if you're using SQLite in memory, it won't be as slow as with other DBs, still, it'll be a bottleneck.
You could use an approach like:
# Maps (model, frozen keyword arguments) to the instance already fetched or
# created for them during this import.
MODEL_CACHE = {}


def get_related_instance(model, **kwargs):
    """Return the ``model`` instance matching ``kwargs``, creating it if needed.

    The result is memoised in ``MODEL_CACHE`` so each distinct combination of
    model and keyword arguments reaches ``model.objects.get_or_create`` once.

    Args:
        model: A class exposing ``objects.get_or_create(**kwargs)``.
        **kwargs: Lookup/creation values; every value must be hashable.

    Returns:
        The fetched or newly created instance.
    """
    # A dict is unhashable, so freeze the keyword arguments into a tuple of
    # pairs; sorting makes the key independent of argument order.
    key = (model, tuple(sorted(kwargs.items())))
    try:
        return MODEL_CACHE[key]
    except KeyError:
        instance, _created = model.objects.get_or_create(**kwargs)
        MODEL_CACHE[key] = instance
        return instance
# Instead of having previous lines now you end up with:
# Plain assignments (no trailing commas), so every name holds the instance
# itself and can be passed on to the dependent lookups below.
categorie = get_related_instance(Categorie, code_categorie=item.get('code_categorie'), nom="Public")
agent_sig = get_related_instance(AgentSig, matricule=item.get('agent_sig'), nom="Awa")
agent_essai = get_related_instance(AgentEssai, matricule=item.get('agent_essai'))
agent_ori = get_related_instance(AgentOri, matricule=item.get('agent_ori'))
agent_rel = get_related_instance(AgentRel, matricule=item.get('agent_releve'))
sous_traitant = get_related_instance(SousTraitant, nom=item.get('sous_traitant'))
centre = get_related_instance(Centre, code=item.get('centre_racc'))
unite_int = get_related_instance(UniteIntervention, code_ui=item.get('code_ui'), sous_traitant=sous_traitant)
rep = get_related_instance(Repartiteur, code=item.get('rep'), crac=centre)
equipe = get_related_instance(Equipe, nom=item.get('equipe'), unite=unite_int)
srep = get_related_instance(SousRepartiteur, code=item.get('srp'), rep=rep)
I cannot tell how much you'll gain thanks to that, it really depends on the data set you're trying to import,
but from experience, it's quite drastic!
5 Use bulk_create
You are doing
for row in rows:
DerangementAdsl(...your data...).save() #<That's one DB call
That's one SQL query per row, while you could do:
ITEMS = []
for row in rows:
    #...Your parsing we saw previously...
    ITEMS.append(DerangementAdsl(**item))
# Called once, after the loop has collected every unsaved instance.
DerangementAdsl.objects.bulk_create(ITEMS)  #<That's one DB call
Putting it all together!
PARSER_CONFIG=(
#(column_index, variable_name, transformation_function)
#...
)
def no_transformation(value):
    """Return ``value`` converted to ``str``, with no other processing."""
    return str(value)
def strptime(value):
    """Normalise a date cell to a ``'%Y-%m-%d %H:%M:%S'`` string.

    ``parse`` is the date parser the original view already calls (presumably
    ``dateutil.parser.parse`` -- confirm). NOTE(review): it must not be
    shadowed by the row helper of the same name defined in this listing.
    """
    # Same formatting as the original view applies to its date columns.
    return dt.datetime.strftime(parse(str(value)), '%Y-%m-%d %H:%M:%S')
# Maps (model, frozen keyword arguments) to the instance already fetched or
# created for them during this import.
MODEL_CACHE = {}


def get_related_instance(model, **kwargs):
    """Return the ``model`` instance matching ``kwargs``, creating it if needed.

    The result is memoised in ``MODEL_CACHE`` so each distinct combination of
    model and keyword arguments reaches ``model.objects.get_or_create`` once.

    Args:
        model: A class exposing ``objects.get_or_create(**kwargs)``.
        **kwargs: Lookup/creation values; every value must be hashable.

    Returns:
        The fetched or newly created instance.
    """
    # A dict is unhashable, so freeze the keyword arguments into a tuple of
    # pairs; sorting makes the key independent of argument order.
    key = (model, tuple(sorted(kwargs.items())))
    try:
        return MODEL_CACHE[key]
    except KeyError:
        instance, _created = model.objects.get_or_create(**kwargs)
        MODEL_CACHE[key] = instance
        return instance
def parse(row):
    """Transform a row into a dict, including its related instances.

    Args:
        row (tuple): Your row's data, indexable by column position.

    Returns:
        dict: The values declared in ``PARSER_CONFIG`` plus the related
        model instances resolved through ``get_related_instance``.

    NOTE(review): the returned dict still carries raw columns (for example
    'nom_client' or 'agent_releve') that the original ``DerangementAdsl(...)``
    call never passed, and it does not resolve ``nd_client``. Filter or map
    the keys before using ``DerangementAdsl(**item)`` if they are not model
    fields.
    """
    item = {
        key: transform(row[index]) for index, key, transform in PARSER_CONFIG
    }
    # Parents are resolved first: the dependent lookups need the instances,
    # which cannot be referenced from inside a single dict literal.
    sous_traitant = get_related_instance(SousTraitant, nom=item.get('sous_traitant'))
    centre = get_related_instance(Centre, code=item.get('centre_racc'))
    unite_int = get_related_instance(
        UniteIntervention, code_ui=item.get('code_ui'), sous_traitant=sous_traitant)
    rep = get_related_instance(Repartiteur, code=item.get('rep'), crac=centre)
    item.update({
        'categorie': get_related_instance(
            Categorie, code_categorie=item.get('code_categorie'), nom="Public"),
        'agent_sig': get_related_instance(
            AgentSig, matricule=item.get('agent_sig'), nom="Awa"),
        'agent_essai': get_related_instance(AgentEssai, matricule=item.get('agent_essai')),
        'agent_ori': get_related_instance(AgentOri, matricule=item.get('agent_ori')),
        'agent_rel': get_related_instance(AgentRel, matricule=item.get('agent_releve')),
        'sous_traitant': sous_traitant,
        'centre': centre,
        'unite_int': unite_int,
        'rep': rep,
        'equipe': get_related_instance(Equipe, nom=item.get('equipe'), unite=unite_int),
        'srep': get_related_instance(SousRepartiteur, code=item.get('srp'), rep=rep),
    })
    return item
def importeradsl(request):
    """Import every data row of the worksheet with a single bulk insert."""
    # I skip your conditions (and the workbook loading) for readability.
    # values_only=True yields plain cell values instead of Cell objects, which
    # is what the transformation functions expect to receive.
    items = [
        DerangementAdsl(**parse(row))
        for row in worksheet.iter_rows(min_row=2, values_only=True)
    ]
    # Called once, after every row has been parsed.
    DerangementAdsl.objects.bulk_create(items)
Conclusion
Following these recommendations, you should end up with an optimized script that runs much faster than the original one and is far more readable and Pythonic.
Roughly, depending on your dataset, 5k rows should take somewhere between 10 seconds and a few minutes.
If each row's related instance (client, category...) is unique, I'd use a more sophisticated approach: loop over your dataset several times to create the related models with bulk_create, and cache them, like:
CLIENTS = []
for row in rows:
    CLIENTS.append(Client(**client_parser(row)))
# Called once, after the loop.
clients = Client.objects.bulk_create(CLIENTS)  # You create *all* your clients with only one DB call!
Then you cache all the created clients. You do the same for all your related models, and eventually you'll load your data with about a dozen DB calls. It really depends on your business logic, though: it should be engineered to handle duplicated records too.
I'm trying to create a soccer match program for a couple of my friends. What I currently have is a class to instantiate teams, and I'm writing the class that instantiates matches.
What I currently have is:
class Team(object):
    """A team together with its running statistics."""

    def __init__(self, name, games=0, points=0, goals=0, wins=0, loses=0):
        """Store the team name and its starting statistics (all default to 0)."""
        self.name = name
        self.points = points
        self.games = games
        self.goals = goals
        self.wins = wins
        self.loses = loses

    def win(self):
        """Record a win: three points, one more win, one more game."""
        self.points += 3
        self.wins += 1
        self.games += 1

    def lose(self):
        """Record a loss: one point, one more loss, one more game."""
        self.points += 1
        self.loses += 1
        self.games += 1

    def ratio(self):
        """Print the wins / loses / games tally."""
        # A single pre-formatted string prints the same text whether print is
        # a statement (Python 2) or a function (Python 3).
        print("(Wins/Loses/Games)\n  %s / %s / %s"
              % (self.wins, self.loses, self.games))
# Asker's draft of the match class: it stores the two goal counts it is given.
class Match(object):
def __init__(self, teamagoals, teambgoals):
self.teamagoals = teamagoals
self.teambgoals = teambgoals
# NOTE(review): this method has no `self` parameter and no body yet; it is
# the unfinished part the question is about.
def playgame(teama, teamb):
#conditional statements which will decide who wins
# Three teams, each created with only a name so every statistic takes its default.
alpha = Team("George's Team")
beta = Team("Josh's Team")
gamma = Team("Fred's Team")
At this point I run into issues about how to go about doing this. As per my question, I'm trying to - in the Match() class - involve two instances of the team class. For example, I would like to call a function in Match() and specify that team alpha and team gamma play against each other, then when they win or lose modify their instances corresponding point, games, etc, values.
What is the way I can do this? Is it possible through putting all of the Team() instances into a list, then importing that list into the Match() class? Or is there some other, more elegant, way? Please help.
I'm not sure exactly how you intend to select which teams play each other, but say that's done by one function. Then, Match could be implemented as follows:
class Match(object):
    """A fixture between two teams whose statistics are updated in place."""

    def __init__(self, teama, teamb):
        """Keep references to the two team objects taking part."""
        self.teama = teama
        self.teamb = teamb

    def play(self, a_points=0, b_points=0):
        """Apply a result to both teams.

        Args:
            a_points: Points earned by ``teama`` (defaults to 0).
            b_points: Points earned by ``teamb`` (defaults to 0).
        """
        # Replace the defaults with a code/function call that decides how
        # many points each team gets.
        self.teama.points += a_points
        self.teamb.points += b_points
        if a_points > b_points:
            self.teama.wins += 1
        elif b_points > a_points:
            self.teamb.wins += 1
        # Equal points is a draw: add the code for when the teams draw here.
I'm not sure how you intend to structure some of the functions, like deciding how many points teams get, but this is a framework that is hopefully useful/gives you ideas on how you could do things.
Following up on your comment, if you would like to be able to pass in specific teams created outside the class, you could do so as follows:
match1 = Match(gamma, alpha). This will create a new match instance, where the teams 'gamma' and 'alpha' will be modified. (teama and teamb in the class will refer to these objects.)
I am working on a project in which I have to develop bio-passwords based on user's keystroke style.
Suppose a user types a password for 20 times, his keystrokes are recorded, like
holdtime : time for which a particular key is pressed.
digraph time : time it takes to press a different key.
suppose a user types a password " COMPUTER". I need to know the time for which every key is pressed. something like :
holdtime for the above password is
C-- 200ms
O-- 130ms
M-- 150ms
P-- 175ms
U-- 320ms
T-- 230ms
E-- 120ms
R-- 300ms
The rationale behind this is that every user will have a different hold time. Say an old person is typing the password: he will take more time than a student. And it will be unique to a particular person.
To do this project, I need to record the time for each key pressed.
I would greatly appreciate if anyone can guide me in how to get these times.
Editing from here..
Language is not important, but I would prefer it in C. I am more interested in getting the dataset.
Record the KeyDown and KeyUp events, and do a diff on the timestamps of each.
http://code.activestate.com/recipes/203830/
Edit:
You may want to check out wxPython, it should help you out:
http://www.wxpython.org/onlinedocs.php
in particular:
http://docs.wxwidgets.org/stable/wx_wxkeyevent.html#wxkeyevent
You mentioned you'd prefer it in C, but since you tagged it Python... :)
Also, since you say you're looking for building a dataset, I assume you'll have to invite users to type in arbitrary text, so you'll need some sort of interface (graphical or otherwise).
Here's a quick example using pygame. You can trivially modify it to ask users to type specific words, but, as it is, it'll just let the user type in arbitrary text, record pressing times for all keypresses, and print each hold and digraph times, in the order that the user typed it, when it exits (i.e., when the user presses Esc).
As Kibibu noticed, showing the user what he's typing in realtime introduces a delay which might mask real key-pressing times, so this code only displays what the user has typed when he types "Enter".
Update: it now calculates digraph as well as hold times (excluding Enter in both cases).
Update2: Per Adi's request, changed from displaying average to displaying each individual time, in order.
import sys
from collections import defaultdict
from time import time
import pygame
from pygame.key import name as keyname
from pygame.locals import *
# Mapping of a key name to the list of its hold times, in the order they were
# recorded (from which you can average, etc)
holdtimes = defaultdict(list)
# Mapping of a (first key name, second key name) pair to a list of digraph times
digraphs = defaultdict(list)
# Keys which have been pressed down, but not up yet, mapped to their key-down time.
pending = {}
# [key name, release time] of the last key to be released, or None.
last_key = None
# Text that the user has typed so far (one sublist for every Enter pressed)
typed_text = [[]]
def show_times():
    """Print every recorded hold time, then every digraph time, in typing order.

    Times are consumed from ``holdtimes`` and ``digraphs`` as they are
    printed, so this is meant to be called once, just before exiting.
    """
    # A single pre-formatted string per print call works whether print is a
    # statement (Python 2) or a function (Python 3).
    print("Holdtimes:")
    for line in typed_text:
        for key in line:
            print("%s: %.5f" % (key, holdtimes[key].pop(0)))
    print("Digraphs:")
    # Pairs are formed within each line only: no digraph is recorded across
    # an Enter press, so pairing keys across lines would pop from an empty
    # list.
    for line in typed_text:
        for key1, key2 in zip(line, line[1:]):
            times = digraphs.get((key1, key2))
            if not times:
                # No time was recorded for this pair (for instance, the
                # second key went down before the first was released).
                continue
            print("(%s, %s): %.5f" % (key1, key2, times.pop(0)))
def time_keypresses(events):
    """Record hold and digraph times from a batch of pygame events.

    Key-down events store the press time in ``pending`` and, when a key was
    released just before, append the release-to-press gap to ``digraphs``.
    Key-up events append the hold time to ``holdtimes`` and the key name to
    the current line of ``typed_text``. Enter is never timed: its key-up
    redraws the screen and starts a new line. Esc prints the results and
    exits.

    Args:
        events: Iterable of event objects exposing ``type`` and ``key``.
    """
    global last_key
    for event in events:
        if event.type == KEYDOWN:
            # ESC exits the program
            if event.key == K_ESCAPE:
                show_times()
                sys.exit(0)
            t = pending[event.key] = time()
            if last_key is not None:
                if event.key != K_RETURN:
                    pair = (last_key[0], keyname(event.key))
                    digraphs[pair].append(t - last_key[1])
                last_key = None
        elif event.type == KEYUP:
            if event.key == K_RETURN:
                update_screen()
                typed_text.append([])
                # Tolerate an Enter release whose press was never seen.
                pending.pop(event.key, None)
                last_key = None
            else:
                pressed_at = pending.pop(event.key, None)
                if pressed_at is None:
                    # Key-up without a recorded key-down (for instance, the
                    # key was already held when the window got focus):
                    # there is nothing to time.
                    continue
                t = time()
                name = keyname(event.key)
                holdtimes[name].append(t - pressed_at)
                last_key = [name, t]
                typed_text[-1].append(name)
        # Any other event handling you might have would go here...
def update_screen():
    """Redraw the window: the prompt header and the current line of typed keys.

    The header is centred 100 pixels above the middle of the screen; the
    current line (or "..." while it is empty) is centred in the middle.
    """
    global screen
    white, black, blue = (255, 255, 255), (0, 0, 0), (0, 0, 255)
    screen.fill(white)

    # Prompt shown above the typed text.
    header = pygame.font.Font(None, 42).render(
        "Type away! Press 'Enter' to show.", True, black)
    header_rect = header.get_rect()
    header_rect.centerx = screen.get_rect().centerx
    header_rect.centery = screen.get_rect().centery - 100

    # Keys typed on the current line, or a placeholder while it is empty.
    current_line = typed_text[-1]
    if current_line:
        shown = "".join(current_line)
    else:
        shown = "..."
    user_text = pygame.font.Font(None, 32).render(shown, True, blue)
    text_rect = user_text.get_rect()
    text_rect.centerx = screen.get_rect().centerx
    text_rect.centery = screen.get_rect().centery

    screen.blit(header, header_rect)
    screen.blit(user_text, text_rect)
    pygame.display.update()
# Script entry point: initialise pygame, open an 800x600 window, draw the
# prompt, then process events forever.
if __name__ == '__main__':
pygame.init()
window = pygame.display.set_mode((800, 600))
screen = pygame.display.get_surface()
update_screen()
# The event queue is polled continuously; the program ends when
# time_keypresses sees Esc and calls sys.exit.
while True:
time_keypresses(pygame.event.get())
Have a look at ncurses. It is a terrific tool for getting information about keypresses in the terminal.
Have a look at this link too.
If you read from the terminal in non-canonical (raw) mode, you can read each keystroke as it's pressed. You won't see key-down and key-up events, as you could if you trapped X events, but it's probably easier, especially if you're just running in a console or terminal.
The answer is conditionally "yes".
If your language/environment has interactive keyboard support that offers Key-Down and Key-Up events, then you catch both events and time the difference between them.
This would be trivially easy in JavaScript on a web page, which would also be the easiest way to show off your work to a wider audience.