I need to save/write a csv file from pandas dataframe in python. I have tried the following way;
import sys
import getopt
import os
def usage():
print "-f Please provide input file"
in1_flag = False
out_flag = True
inFile1 = ""
outFile1 = ""
try:
opts, args = getopt.getopt(sys.argv[1:], 'i:o', ['input_file1=', 'out_file1='])
except getopt.GetoptError:
usage()
sys.exit(2)
for opt, arg in opts:
if opt in ('-i', '--input_file1'):
inFile1 = os.path.abspath(arg)
in1_flag = True
elif opt in ('-o', '--out_file1'):
outFile1 = os.path.abspath(arg)
split_h = []
with open(inFile1) as ff:
for line in ff:
split_h = line.split()
import pandas as pd
d1 = {'report': split_h}
df1 = pd.DataFrame(data = d1, columns=['report'])
df1.to_csv(outFile1, sep = '\t',header = False, index= False)
I thought it would be as easy as taking input but here I am stuck at writing file.
I have written a MapReduce code to calculate the mean of a set of values for each key. It works fine when I use just mapper and a reducer. But when I introduce a combiner in between to reduce the load on reducer, the keys are getting repeated in the result. TIA. Code is given below.
mapper.py:
#!/usr/bin/env python
import sys
from datetime import datetime
for line in sys.stdin:
data = line.strip().split("\t")
if(len(data) < 5):
continue
date, time, store, item, cost, payment = data
print("{0}\t{1}".format(datetime.strptime(date, "%Y-%m-%d").weekday(),cost))
combiner.py:
#!/usr/bin/env python
import sys
from datetime import datetime
oldKey = None
salesList = ""
for line in sys.stdin:
data = line.rstrip().split('\t')
thisKey, thisSale = data
if(oldKey and oldKey != thisKey):
print("{0}\t{1}".format(oldKey,salesList))
salesList = ""
else:
if(oldKey == thisKey):
if(salesList != ""):
salesList = salesList + ',' + thisSale
else:
salesList = thisSale
oldKey = thisKey
if(oldKey):
salesList = salesList + ',' + thisSale
print("{0}\t{1}".format(oldKey,salesList))
salesList = ""
reducer.py
#!/usr/bin/env python
import sys
from datetime import datetime
oldKey = None
meanSales = None
salesTotal = 0
count = 0
for line in sys.stdin:
data = line.rstrip().split('\t')
thisKey, thisSale = data
thisSaleList = thisSale.split(',')
thisSaleListFloat = list(map(float, thisSaleList))
meanSales = sum(thisSaleListFloat)/len(thisSaleListFloat)
print("{0}\t{1}".format(thisKey, meanSales))
Output
0 249.91917747419384
0 250.09807318775844
1 249.87984898663836
1 249.59593170284487
2 249.95321425419965
2 249.75339205149234
3 249.54634982922747
3 250.19731461573994
4 250.3129656082639
4 250.13323419720658
5 250.13367036331366
5 250.03468060131152
6 250.207532163134
6 249.67593639719652
I'm now doing a little project which uses celery to turn csv and xlsx files into postgresql table.
The code below works fine without celery(except large files),but after using celery it produce some errors and bugs.
I've looked for similar questions in StackOverFlow but don't have any idea how to do and why.
Hope you guys can help me with it,thanks.
First error is as follows:
csv-1
csv-2
I think it has something to do with my encoding part, but I tried to open it with utf-8-sig and big-5, not working.(It works fine without celery)
`
# -*- coding: utf-8 -*-
from django.shortcuts import render_to_response
from django.template import RequestContext
from django.http import HttpResponseRedirect
from django.core.urlresolvers import reverse
from django.contrib import messages
from django.conf import settings
from django.db import connection
from django.views.decorators.csrf import csrf_exempt
from celery import Celery
from celery import task
import json
import csv
import sys
import random
import psycopg2
import xlrd
import openpyxl as pyxl
from .models import Document
from .forms import DocumentForm
app = Celery(
'tasks',
broker='amqp://guest:guest#localhost:5672//',
backend='rpc://'
)
CELERY_RESULT_BACKEND = 'rpc://'
CELERY_RESULT_PERSISTENT = False
#app.task()
def csvwritein(doc):# Transform csv to table
doc = doc
conn = psycopg2.connect("dbname='apidb' user='api' host='localhost'
password='eric40502' port='5432'")
readcur = conn.cursor()
readcur.execute("select exists(select * from
information_schema.tables where table_name='%s')" % doc.tablename) # check if
same file is already in database
check = readcur.fetchone()[0]
try:
fr = open(doc.path,encoding = 'utf-8-sig')
dr.delay(fr,doc,check)
fr.close()
except Exception as e:
fr = open(doc.path,encoding = 'big5')
dr.delay(fr,doc,check)
fr.close()
conn.commit()
readcur.close()
#app.task()
def dr(fr,doc,check): # make datareader as function to keep code 'dry'
csvt = 0 #count csv reader loop time
row_id = 1 # used for following id field
conn = psycopg2.connect("dbname='apidb' user='api' host='localhost'
password='eric40502' port='5432'")
maincur = conn.cursor()
writecur = conn.cursor()
datareader = csv.reader(fr, delimiter=',')
for row in datareader:
if csvt == 0: # first time in loop(create field) and check no
same file exists
if check == True:
app =
''.join([random.SystemRandom().choice('abcdefghijklmnopqrstuvwxyz0123456789')
for i in range(6)])
tname = '%s-%s' % (doc.tablename,app
tablename = '"%s-%s"' % (doc.tablename,app)
doc.tablename = tname
doc.save()
else:
tablename = '"%s"' % doc.tablename
maincur.execute("CREATE TABLE %s (id SERIAL PRIMARY
KEY);" % tablename)
row_count = sum(1 for line in datareader)
col_count = len(row)
frow = row
for i in range(0,col_count,1):
row[i] = '"%s"' % row[i] # change number to
string
maincur.execute("ALTER TABLE %s ADD %s
CITEXT;" % (tablename,row[i]))
csvt = csvt+1
fr.seek(0)
next(datareader)
elif csvt > 0: # not first time(insert data) and check no
same file exists
for j in range(0,col_count,1):
if j == 0:
writecur.execute("INSERT INTO %s (%s)
VALUES ('%s');" % (tablename,frow[j],row[j]))
else:
writecur.execute("UPDATE %s SET %s =
'%s' WHERE id = '%d';" %(tablename,frow[j],row[j],row_id))
csvt = csvt+1
row_id = row_id+1
else:
break
conn.commit()
maincur.close()
writecur.close()
conn.close()
csvt = 0
doc = Document.objects.all()
`
Second error is about turning a xlsx file(about 130,000 rows) into postgresql table, and the worker got sig-kill after 2-3 minutes.
Debug Message:
[2016-10-27 06:17:05,227: ERROR/MainProcess] Process 'Worker-1' pid:13829 exited with 'signal 9 (SIGKILL)' [2016-10-27 06:17:05,328:ERROR/MainProcess] Task data.tasks.xlsxwritein[5aec4679-c48b-4d07-a0a9-5e4e37fcd24b] raised unexpected: WorkerLostError('Worker exited prematurely: signal 9 (SIGKILL).',) Traceback (most recent call last): File "/usr/local/lib/python3.4/dist-packages/billiard/pool.py", line 1175, in mark_as_worker_lost human_status(exitcode)), billiard.exceptions.WorkerLostError: Worker exited prematurely: signal 9 (SIGKILL).
#The code continues from the above task.py file
#app.task()
def xlsxwritein(doc): # write into database for file type xlsx
xlsxt = 0
conn = psycopg2.connect("dbname='apidb' user='api' host='localhost'
password='eric40502' port='5432'")
maincur = conn.cursor()
readcur = conn.cursor()
writecur = conn.cursor()
readcur.execute("select exists(select * from
information_schema.tables where table_name='%s')" % doc.tablename) # check if
same file is already in database
check = readcur.fetchone()[0]
row_id = 1 # used for following id field
wb = pyxl.load_workbook(doc.path)
sheetnames = wb.get_sheet_names()
ws = wb.get_sheet_by_name(sheetnames[0])
for rown in range(ws.get_highest_row()):
if xlsxt == 0:
if check == True:
app =
''.join([random.SystemRandom().choice('abcdefghijklmnopqrstuvwxyz0123456789')
for i in range(6)])
tname = '%s-%s' % (doc.tablename,app)
tablename = '"%s-%s"' % (doc.tablename,app)
doc.tablename = tname
doc.save()
else:
tablename = '"%s"' % doc.tablename
field = [ws.cell(row=1,column=col_index).value for
col_index in range(1,ws.get_highest_column()+1)]
maincur.execute("CREATE TABLE %s (id SERIAL PRIMARY
KEY);" % tablename)
for coln in range(ws.get_highest_column()):
field[coln] = '"%s"' % field[coln] # change
number to string
if field[coln] == 'ID':
field[coln] = 'original_id'
maincur.execute("ALTER TABLE %s ADD %s
CITEXT;" % (tablename,field[coln]))
xlsxt = xlsxt+1
elif xlsxt > 0 and check == False: # not first time(insert
data) and check no same file exists
for coln in range(ws.get_highest_column()):
if coln == 0:
writecur.execute("INSERT INTO %s (%s)
VALUES ('%s');"
%(tablename,field[coln],str(ws.cell(row=rown,column=coln+1).value)))
else:
writecur.execute("UPDATE %s SET %s =
'%s' WHERE id = '%d';"
%(tablename,field[coln],str(ws.cell(row=rown+1,column=coln+1).value),row_id))
xlsxt = xlsxt+1
row_id = row_id+1
else:
break
conn.commit()
maincur.close()
readcur.close()
writecur.close()
conn.close()
xlsxt = 0
Probably something is going wrong during arguments deserialization. Instead of passing doc object try instead passing filename and then read file inside of task.
I updated the code and it now provides the graph, however after giving me the graph it produces the following error messages.
Warning (from warnings module):
File "C:\Python27\lib\site-packages\matplotlib\collections.py", line 590
if self._edgecolors == str('face'):
FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
import urllib2
import time
import datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import matplotlib.dates as mdates
from matplotlib.finance import candlestick_ochl
import matplotlib
import pylab
matplotlib.rcParams.update({'font.size': 9})
def rsiFunc(prices, n=14):
deltas = np.diff(prices)
seed = deltas[:n+1]
up = seed[seed>=0].sum()/n
down = -seed[seed<0].sum()/n
rs = up/down
rsi = np.zeros_like(prices)
rsi[:n] = 100. - 100./(1.+rs)
for i in range(n, len(prices)):
delta = deltas[i-1] # cause the diff is 1 shorter
if delta>0:
upval = delta
downval = 0.
else:
upval = 0.
downval = -delta
up = (up*(n-1) + upval)/n
down = (down*(n-1) + downval)/n
rs = up/down
rsi[i] = 100. - 100./(1.+rs)
return rsi
def movingaverage(values,window):
weigths = np.repeat(1.0, window)/window
smas = np.convolve(values, weigths, 'valid')
return smas # as a numpy array
def ExpMovingAverage(values, window):
weights = np.exp(np.linspace(-1., 0., window))
weights /= weights.sum()
a = np.convolve(values, weights, mode='full')[:len(values)]
a[:window] = a[window]
return a
def computeMACD(x, slow=26, fast=12):
"""
compute the MACD (Moving Average Convergence/Divergence) using a fast and slow exponential moving avg'
return value is emaslow, emafast, macd which are len(x) arrays
"""
emaslow = ExpMovingAverage(x, slow)
emafast = ExpMovingAverage(x, fast)
return emaslow, emafast, emafast - emaslow
def graphData(stock,MA1,MA2):
'''
Use this to dynamically pull a stock:
'''
try:
print 'Currently Pulling',stock
print str(datetime.datetime.fromtimestamp(int(time.time())).strftime('%Y-%m-%d %H:%M:%S'))
#Keep in mind this is close high low open data from Yahoo
urlToVisit = 'http://chartapi.finance.yahoo.com/instrument/1.0/'+stock+'/chartdata;type=quote;range=10y/csv'
stockFile =[]
try:
sourceCode = urllib2.urlopen(urlToVisit).read()
splitSource = sourceCode.split('\n')
for eachLine in splitSource:
splitLine = eachLine.split(',')
if len(splitLine)==6:
if 'values' not in eachLine:
stockFile.append(eachLine)
except Exception, e:
print str(e), 'failed to organize pulled data.'
except Exception,e:
print str(e), 'failed to pull pricing data'
try:
date, closep, highp, lowp, openp, volume = np.loadtxt(stockFile,delimiter=',', unpack=True,
converters={ 0: mdates.strpdate2num('%Y%m%d')})
x = 0
y = len(date)
newAr = []
while x < y:
appendLine = date[x],openp[x],closep[x],highp[x],lowp[x],volume[x]
newAr.append(appendLine)
x+=1
Av1 = movingaverage(closep, MA1)
Av2 = movingaverage(closep, MA2)
SP = len(date[MA2-1:])
fig = plt.figure(facecolor='#07000d')
ax1 = plt.subplot2grid((6,4), (1,0), rowspan=4, colspan=4, axisbg='#07000d')
candlestick_ochl(ax1, newAr[-SP:], width=.6, colorup='#53c156', colordown='#ff1717')#width=.6, plot_day_summary_ohlc
Label1 = str(MA1)+' SMA'
Label2 = str(MA2)+' SMA'
ax1.plot(date[-SP:],Av1[-SP:],'#e1edf9',label=Label1, linewidth=1.5)
ax1.plot(date[-SP:],Av2[-SP:],'#4ee6fd',label=Label2, linewidth=1.5)
ax1.grid(True, color='w')
ax1.xaxis.set_major_locator(mticker.MaxNLocator(10))
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
ax1.yaxis.label.set_color("w")
ax1.spines['bottom'].set_color("#5998ff")
ax1.spines['top'].set_color("#5998ff")
ax1.spines['left'].set_color("#5998ff")
ax1.spines['right'].set_color("#5998ff")
ax1.tick_params(axis='y', colors='w')
plt.gca().yaxis.set_major_locator(mticker.MaxNLocator(prune='upper')) #gca()
ax1.tick_params(axis='x', colors='w')
plt.ylabel('Stock price and Volume')
maLeg = plt.legend(loc=9, ncol=2, prop={'size':7},
fancybox=True, borderaxespad=0.)
maLeg.get_frame().set_alpha(0.4)
textEd = plt.gca().get_legend().get_texts()#pylab.gca() changed to plt.gca()
plt.setp(textEd[0:5], color = 'w')#changed pylab.setp to plt.setp
volumeMin = 0
ax0 = plt.subplot2grid((6,4), (0,0), sharex=ax1, rowspan=1, colspan=4, axisbg='#07000d')
rsi = rsiFunc(closep)
rsiCol = '#c1f9f7'
posCol = '#386d13'
negCol = '#8f2020'
ax0.plot(date[-SP:], rsi[-SP:], rsiCol, linewidth=1.5)
ax0.axhline(70, color=negCol)
ax0.axhline(30, color=posCol)
ax0.fill_between(date[-SP:], rsi[-SP:], 70, where=(rsi[-SP:]>=70), facecolor=negCol, edgecolor=negCol, alpha=0.5)
ax0.fill_between(date[-SP:], rsi[-SP:], 30, where=(rsi[-SP:]<=30), facecolor=posCol, edgecolor=posCol, alpha=0.5)
ax0.set_yticks([30,70])
ax0.yaxis.label.set_color("w")
ax0.spines['bottom'].set_color("#5998ff")
ax0.spines['top'].set_color("#5998ff")
ax0.spines['left'].set_color("#5998ff")
ax0.spines['right'].set_color("#5998ff")
ax0.tick_params(axis='y', colors='w')
ax0.tick_params(axis='x', colors='w')
plt.ylabel('RSI')
ax1v = ax1.twinx()
ax1v.fill_between(date[-SP:],volumeMin, volume[-SP:], facecolor='#00ffe8', alpha=.4)
ax1v.axes.yaxis.set_ticklabels([])
ax1v.grid(False)
ax1v.set_ylim(0, 3*volume.max())
ax1v.spines['bottom'].set_color("#5998ff")
ax1v.spines['top'].set_color("#5998ff")
ax1v.spines['left'].set_color("#5998ff")
ax1v.spines['right'].set_color("#5998ff")
ax1v.tick_params(axis='x', colors='w')
ax1v.tick_params(axis='y', colors='w')
ax2 = plt.subplot2grid((6,4), (5,0), sharex=ax1, rowspan=1, colspan=4, axisbg='#07000d')
# START NEW INDICATOR CODE #
# END NEW INDICATOR CODE #
plt.gca().yaxis.set_major_locator(mticker.MaxNLocator(prune='upper'))
ax2.spines['bottom'].set_color("#5998ff")
ax2.spines['top'].set_color("#5998ff")
ax2.spines['left'].set_color("#5998ff")
ax2.spines['right'].set_color("#5998ff")
ax2.tick_params(axis='x', colors='w')
ax2.tick_params(axis='y', colors='w')
ax2.yaxis.set_major_locator(mticker.MaxNLocator(nbins=5, prune='upper'))
for label in ax2.xaxis.get_ticklabels():
label.set_rotation(45)
plt.suptitle(stock.upper(),color='w')
plt.setp(ax0.get_xticklabels(), visible=False)
plt.setp(ax1.get_xticklabels(), visible=False)
'''ax1.annotate('Big news!',(date[510],Av1[510]),
xytext=(0.8, 0.9), textcoords='axes fraction',
arrowprops=dict(facecolor='white', shrink=0.05),
fontsize=14, color = 'w',
horizontalalignment='right', verticalalignment='bottom')'''
plt.subplots_adjust(left=.09, bottom=.14, right=.94, top=.95, wspace=.20, hspace=0)
plt.show()
fig.savefig('example.png',facecolor=fig.get_facecolor())
except Exception,e:
print 'main loop',str(e)
while True:
stock = raw_input('Stock to plot: ')
graphData(stock,10,50)
Please look at the thread Violin plot: warning with matplotlib 1.4.3 and pyplot fill_between warning since upgrade of numpy to 1.10.10
It seems there is a bug in matplotlib 1.4.3 (which has only started causing that error since the upgrade to numpy 1.10). This is reportedly corrected in 1.5.0 (which should be released soon). Hope this helps.
i made some code where i need to make a plot where my data is persed to moving average
import numpy as np
import csv
import datetime
import matplotlib.pyplot as plt
#Open Data/File
data = open('iphonevsandroid.csv', 'r')
reader = csv.reader(data, delimiter=',')
#Define lists
iphone_data = []
android_data = []
dateTime = []
stringdates = []
#iphone_data_average = []
#android_data_average = []
for row in reader:
first_date_row = row[0]
first_date = row[0][:-13]
if row[1] != 'iphone':
iphone_data.append(int(row[1]))
if row[2] != 'android':
android_data.append(int(row[2]))
if row[0] != 'week':
stringdates.append(row[0][:-13])
for item in stringdates:
dateTime.append(datetime.datetime.strptime(item, '%Y-%m-%d'))
def movingaverage(values,window):
weigths = np.repeat(1.0, window)/window
#including valid will REQUIRE there to be enough datapoints.
#for example, if you take out valid, it will start # point one,
#not having any prior points, so itll be 1+0+0 = 1 /3 = .3333
smas = np.convolve(values, weigths, 'valid')
return smas # as a numpy array
movingaverage(iphone_data,3)
movingaverage(android_data,3)
plt.ylabel('Indsæt y label')
plt.xlabel('Indsæt x label')
plt.plot(dateTime,movingaverage(iphone_data,3)+2)
plt.plot(dateTime,movingaverage(android_data,3)+2)
plt.show()
My problem is that i get this error: ValueError: x and y must have same first dimension.
I know its because of the len of the values,
if i print the len of:
print len(dateTime)
print len(movingaverage(iphone_data,3))
print len(movingaverage(android_data,3))
i get:
528
526
526
How do i get dateTime to 526???
smas = np.convolve(values, weigths, 'valid')
should be
smas = np.convolve(values, weigths, 'same')
and if you don't want the border values, then you will have to remove them yourself, that is for odd window lengths:
smas = np.convolve(values, weigths, 'valid')[(window-1)/2:-(window-1)/2]
Note that you would also have to remove these values from android_data and iphone_data.