only apply date format to date columns with xlwt - python-2.7

I have results from a database search where some columns have dates, but not others. I need help to write
data = [['556644', 'Mr', 'So', 'And', 'So', Decimal('0.0000'), datetime.datetime(2012, 2, 25, 0, 0), '', False, datetime.datetime(2013, 6, 30, 0, 0)],...]
into an Excel spreadsheet so that
easyxf(num_format_str='DD/MM/YYYY')
is only applied to the datetime columns. I'm quite new to Python and I've been banging my head on this for quite a few days now. Thanks!

After simplifying your self-answer:
date_xf = easyxf(num_format_str='DD/MM/YYYY')  # sets date format in Excel
data = [list(n) for n in cursor.fetchall()]
for row_index, row_contents in enumerate(data):
    for column_index, cell_value in enumerate(row_contents):
        if isinstance(cell_value, datetime.date):
            sheet1.write(row_index + 1, column_index, cell_value, date_xf)
        else:
            sheet1.write(row_index + 1, column_index, cell_value)

The data is being imported from SQL Server with pyodbc, which is where cursor.fetchall() comes from:
data = [list(n) for n in cursor.fetchall()]
for row_index, row_contents in enumerate(data):
    for column_index, cell_value in enumerate(row_contents):
        xf = None
        if isinstance(data[row_index][column_index], datetime.date):
            xf = easyxf(num_format_str='DD/MM/YYYY')  # sets date format in Excel
        if xf:
            sheet1.write(row_index + 1, column_index, cell_value, xf)
        else:
            sheet1.write(row_index + 1, column_index, cell_value)
Done! :)
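For anyone assembling this from scratch: below is a minimal end-to-end sketch of the same approach; the connection string, query, and output path are placeholders, not from the original post.
import datetime
import pyodbc
from xlwt import Workbook, easyxf

# placeholder connection string and query -- adjust for your server and table
conn = pyodbc.connect('DRIVER={SQL Server};SERVER=myserver;DATABASE=mydb;Trusted_Connection=yes')
cursor = conn.cursor()
cursor.execute('SELECT * FROM mytable')

book = Workbook()
sheet1 = book.add_sheet('results')
date_xf = easyxf(num_format_str='DD/MM/YYYY')  # date format for datetime cells

# datetime.datetime is a subclass of datetime.date, so this catches both
data = [list(n) for n in cursor.fetchall()]
for row_index, row_contents in enumerate(data):
    for column_index, cell_value in enumerate(row_contents):
        if isinstance(cell_value, datetime.date):
            sheet1.write(row_index + 1, column_index, cell_value, date_xf)
        else:
            sheet1.write(row_index + 1, column_index, cell_value)

book.save('results.xls')  # xlwt writes the legacy .xls format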

To write a date to a cell in Excel, pass an extra argument to the write call - the style for the cell:
import datetime
import xlwt
workbook = xlwt.Workbook()
worksheet = workbook.add_sheet('test')
date_style = xlwt.easyxf(num_format_str='YYYY/MM/DD')
worksheet.write(0, 0, 'Today')
worksheet.write(1, 0, datetime.date.today(), date_style)
workbook.save('C:\\data.xls')


Replace Null values with median in pyspark

How can I replace null values with the median in the columns Age and Height of the data set df below?
df = spark.createDataFrame([(1, 'John', 1.79, 28, 'M', 'Doctor'),
                            (2, 'Steve', 1.78, 45, 'M', None),
                            (3, 'Emma', 1.75, None, None, None),
                            (4, 'Ashley', 1.6, 33, 'F', 'Analyst'),
                            (5, 'Olivia', 1.8, 54, 'F', 'Teacher'),
                            (6, 'Hannah', 1.82, None, 'F', None),
                            (7, 'William', None, 42, 'M', 'Engineer'),
                            (None, None, None, None, None, None),
                            (8, 'Ethan', 1.55, 38, 'M', 'Doctor'),
                            (9, 'Hannah', 1.65, None, 'F', 'Doctor'),
                            (10, 'Xavier', 1.64, 43, None, 'Doctor')],
                           ['Id', 'Name', 'Height', 'Age', 'Gender', 'Profession'])
Following the post Replace missing values with mean - Spark Dataframe, I used the function given there:
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=df.columns,
    outputCols=["{}_imputed".format(c) for c in df.columns])
imputer.fit(df).transform(df)
It throws an error:
IllegalArgumentException: 'requirement failed: Column Id must be of type equal to one of the following types: [DoubleType, FloatType] but was actually of type LongType.'
So please help.
Thank you
It's likely an initial casting error (I had some strings that needed to be floats). To convert all columns to floats, do the following. (Note that casting string columns such as Name to float yields nulls, which is why the categoricals are imputed separately further down.)
from pyspark.sql.functions import col
df = df.select(*(col(c).cast("float").alias(c) for c in df.columns))
Then you should be fine to impute. Note: I set my strategy to median rather than mean.
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols=df.columns,
    outputCols=["{}_imputed".format(c) for c in df.columns]
).setStrategy("median")

# Add imputation cols to df
df = imputer.fit(df).transform(df)
I'd be interested in a more elegant solution, but I imputed the categoricals separately from the numerics. To impute the categoricals, I took the most common value and filled the blanks with it using the when and otherwise functions:
import pyspark.sql.functions as F

for col_name in ['Name', 'Gender', 'Profession']:
    common = df.dropna().groupBy(col_name).agg(F.count("*")).orderBy('count(1)', ascending=False).first()[col_name]
    df = df.withColumn(col_name, F.when(F.isnull(col_name), common).otherwise(df[col_name]))
To impute the numerics, casting the Age and Id columns to doubles before running the imputer circumvents the type issue, and the imputer can then be restricted to the numerical columns:
from pyspark.ml.feature import Imputer

df = df.withColumn("Age", df['Age'].cast('double')).withColumn('Id', df['Id'].cast('double'))
imputer = Imputer(
    inputCols=['Id', 'Height', 'Age'],
    outputCols=['Id', 'Height', 'Age'])
imputer.fit(df).transform(df)
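As a quick sanity check (an addition, not part of the original answer), a per-column null count after imputation should show zeros for the numeric columns:
import pyspark.sql.functions as F

imputed = imputer.fit(df).transform(df)
# one summary row of per-column null counts
imputed.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in imputed.columns]).show()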

ValueError: invalid literal for float(): when adding annotation in pandas

I get this error when I try to add an annotation to my plot - ValueError: invalid literal for float(): 10_May.
My code (I use to_datetime and strftime before plotting, as I needed to sort dates which were stored as strings):
# dealing with dates as strings
grouped.index = pd.to_datetime(grouped.index, format='%d_%b')
grouped = grouped.sort_index()
grouped.index = grouped.index.strftime('%d_%b')
plt.annotate('Peak',
             (grouped.index[9], grouped['L'][9]),
             xytext=(15, 15),
             textcoords='offset points',
             arrowprops=dict(arrowstyle='-|>'))
grouped.plot()
grouped.index[9] returns u'10_May' while grouped['L'][9] returns 10.0.
I know that pandas expects the index to be a float, but I thought I could access it by df.index[]. I will appreciate your suggestions.
What works for me is to plot first and then get the index position with Index.get_loc:
ax = df.plot()
ax.annotate('Peak',
            (df.index.get_loc(df.index[9]), df['L'][9]),
            xytext=(15, 15),
            textcoords='offset points',
            arrowprops=dict(arrowstyle='-|>'))
Sample:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.random.seed(10)
df = pd.DataFrame({'L': [3, 5, 0, 1]}, index=['4_May', '3_May', '1_May', '2_May'])
#print (df)

df.index = pd.to_datetime(df.index, format='%d_%b')
df = df.sort_index()
df.index = df.index.strftime('%d_%b')

df.plot()
plt.annotate('Peak',
             (df.index.get_loc(df.index[2]), df['L'][2]),
             xytext=(15, 15),
             textcoords='offset points',
             arrowprops=dict(arrowstyle='-|>'))
EDIT:
A more general solution uses get_loc + idxmax + max:
ax = df.plot()
ax.annotate('Peak',
            (df.index.get_loc(df['L'].idxmax()), df['L'].max()),
            xytext=(15, 15),
            textcoords='offset points',
            arrowprops=dict(arrowstyle='-|>'))
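Why get_loc works here (an explanatory aside, not from the original answer): when a DataFrame has a string index, pandas plots the points at integer positions 0 to n-1 along the x axis, and Index.get_loc maps a label back to exactly that position, giving annotate a numeric x coordinate:
import pandas as pd

df = pd.DataFrame({'L': [3, 5, 0, 1]}, index=['01_May', '02_May', '03_May', '04_May'])
# get_loc maps an index label to its integer position, which matches
# the x coordinate pandas uses when plotting a string-indexed frame
print(df.index.get_loc('03_May'))  # -> 2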

Put information from a database file into lists

import sqlite3

db = sqlite3.connect('newdb.db')
team_list = ['Munster', 'Leinster', 'Ulster', 'Glasgow']
cursor = db.cursor()
for i in range(len(team_list)):
    team_names = team_list[i].upper()
    searchStr = '%' + team_names + '%'
    cursor.execute('select * from tickets where Name LIKE ?', (searchStr,))
    teams_points = cursor.fetchall()
    print teams_points
cursor.close()
db.close()
This is my Python code used to display all the data in the table 'tickets' in newdb.db. I have a list with the team names, and I want to be able to search for these team names in the database and calculate information on tickets sold.
[(u'MUNSTER', 5, u'First Round'), (u'MUNSTER', 5, u'First Round'),
(u'MUNSTER', 8, u'Second Round'), (u'MUNSTER', 10, u'Both Rounds')]
[(u'LEINSTER', 2, u'Second Round'), (u'LEINSTER', 16, u'First Round'),
(u'LEINSTER', 5, u'Both Rounds'), (u'LEINSTER', 6, u'Both Rounds'),
(u'LEINSTER', 3, u'First Round')]
[(u'ULSTER', 10, u'Second Round')]
[(u'GLASGOW', 4, u'First Round')]
Above is the output when I run the script. I want to be able to put each team into a list as
team_list=['team_name', 'total first round tickets', 'second round tickets']
munster_list = ['MUNSTER', '20', '18']
leinster_list = ['LEINSTER','30','13']
ulster_list = ['ULSTER','0','10']
glasgow_list = ['GLASGOW','4','0']
so then to print the list I can just use print munster_list
Use GROUP BY to get one output row from the rows in each group. Use CASE expressions to sum up only certain values:
SELECT Name,
       sum(CASE WHEN Type IN ('First Round', 'Both Rounds')
                THEN Amount
                ELSE 0
           END) AS "first round tickets",
       sum(CASE WHEN Type IN ('Second Round', 'Both Rounds')
                THEN Amount
                ELSE 0
           END) AS "second round tickets"
FROM tickets
GROUP BY Name
ORDER BY Name;
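To get from that query back to the lists described in the question, a rough sketch (assuming the same newdb.db file and tickets schema; the str() conversion mirrors the example lists above):
import sqlite3

db = sqlite3.connect('newdb.db')
cursor = db.cursor()
cursor.execute("""SELECT Name,
                         sum(CASE WHEN Type IN ('First Round', 'Both Rounds') THEN Amount ELSE 0 END),
                         sum(CASE WHEN Type IN ('Second Round', 'Both Rounds') THEN Amount ELSE 0 END)
                  FROM tickets
                  GROUP BY Name
                  ORDER BY Name""")

# one list per team, e.g. ['MUNSTER', '20', '18']
for name, first, second in cursor.fetchall():
    team_row = [name, str(first), str(second)]
    print team_row

cursor.close()
db.close()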

How to write value inside existing Excel sheet?

Here is the piece of code where I am trying to write a value into a particular cell of an existing Excel sheet, but the value is not being written to that sheet. How do I write that value? I used xlutils.copy:
from datetime import datetime, timedelta, date
from xlrd import open_workbook
from xlwt import Workbook
from xlutils.copy import copy
import xlrd
import datetime

book = open_workbook('Data.xlsx')
sheet = book.sheet_by_index(0)

# read header values into the list
keys = [sheet.cell(0, col_index).value for col_index in xrange(sheet.ncols)]

# read the excel sheet data into a list of dicts
dict_list = []
for row_index in xrange(1, sheet.nrows):
    d = {keys[col_index]: sheet.cell(row_index, col_index).value
         for col_index in xrange(sheet.ncols)}
    dict_list.append(d)

# convert the integer date to YMD format and total this week's effort
TotalEffort = 0
for count in range(len(dict_list)):
    year, month, day, hour, minute, second = xlrd.xldate_as_tuple(dict_list[count]["Date"], book.datemode)
    # compare week numbers
    if datetime.date.today().isocalendar()[1] == date(year, month, day).isocalendar()[1]:
        TotalEffort = TotalEffort + dict_list[count]["Effort"]

weeknum = str(datetime.date.today().isocalendar()[1])
Total = str(TotalEffort)
print "Effort for week " + weeknum + " is: " + Total + " hours"

rb = open_workbook('output.xlsx')
ws = rb.sheet_by_index(0)
for rowidx in range(ws.nrows):  # number of rows in the sheet
    row = ws.row(rowidx)  # get row rowidx from the sheet
    for colidx, cell in enumerate(row):  # read all cells in the row
        if cell.value == "search word":
            print 'row ', rowidx
            print 'column', colidx
            cur_row = rowidx + 2
            cur_col = colidx + 36

wb = copy(rb)
# pick the first sheet
shw = wb.get_sheet(0)
value = str(Total)
# writing to shw
shw.write(cur_row, cur_col, 'value')
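Two things stand out in the code above: write is passed the literal string 'value' rather than the variable value, and the copied workbook is never saved, so nothing ever reaches disk. A minimal sketch of the corrected write-and-save step (the output filename is a placeholder, and note that xlwt saves in the legacy .xls format, not .xlsx):
from xlrd import open_workbook
from xlutils.copy import copy

rb = open_workbook('output.xlsx')
wb = copy(rb)                       # writable xlwt copy of the read-only xlrd workbook
shw = wb.get_sheet(0)

cur_row, cur_col = 2, 36            # placeholder coordinates from the search above
value = '40'                        # placeholder for str(Total)

shw.write(cur_row, cur_col, value)  # pass the variable, not the string 'value'
wb.save('output_updated.xls')       # without save() the change never reaches disk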

django compare date with date

I am trying to skip importing lines whose date is earlier than the latest date already imported.
timelimit = Operation.objects.filter(account=3).aggregate(Max('date'))
for row in csv.reader(reencode(f), delimiter=';', quotechar='"'):
    if row != []:
        if row[0] > timelimit:
            operation.date = row[0]
row looks like:
2012-01-12,something,0,something2
Of course the comparison row[0] > timelimit is wrong - but what is correct?
# this will convert your string ("2012-01-12") to a datetime object
>>> from datetime import datetime
>>> x = datetime.strptime(row[0], "%Y-%m-%d")
>>> x
datetime.datetime(2012, 1, 12, 0, 0)
And then you can turn timelimit into a datetime object too, like so:
timelimit = datetime(2011, 10, 10)
and then comparing these two is trivial:
x > timelimit
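One wrinkle worth spelling out (an addition, not part of the original answer): aggregate(Max('date')) returns a dict, so the maximum itself lives under the 'date__max' key. A sketch of the full loop under that assumption, reusing Operation, reencode and f from the question:
import csv
from datetime import datetime

from django.db.models import Max

# aggregate() returns {'date__max': <max date>}; the value is None when there are no rows
timelimit = Operation.objects.filter(account=3).aggregate(Max('date'))['date__max']

for row in csv.reader(reencode(f), delimiter=';', quotechar='"'):
    if row:
        # .date() assumes Operation.date is a DateField; drop it for a DateTimeField
        row_date = datetime.strptime(row[0], "%Y-%m-%d").date()
        if timelimit is None or row_date > timelimit:
            operation.date = row_date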