I need to save/write a csv file from pandas dataframe in python. I have tried the following way;
import sys
import getopt
import os
def usage():
print "-f Please provide input file"
in1_flag = False
out_flag = True
inFile1 = ""
outFile1 = ""
try:
opts, args = getopt.getopt(sys.argv[1:], 'i:o', ['input_file1=', 'out_file1='])
except getopt.GetoptError:
usage()
sys.exit(2)
for opt, arg in opts:
if opt in ('-i', '--input_file1'):
inFile1 = os.path.abspath(arg)
in1_flag = True
elif opt in ('-o', '--out_file1'):
outFile1 = os.path.abspath(arg)
split_h = []
with open(inFile1) as ff:
for line in ff:
split_h = line.split()
import pandas as pd
d1 = {'report': split_h}
df1 = pd.DataFrame(data = d1, columns=['report'])
df1.to_csv(outFile1, sep = '\t',header = False, index= False)
I thought it would be as easy as taking input but here I am stuck at writing file.
Related
I have written a MapReduce code to calculate the mean of a set of values for each key. It works fine when I use just mapper and a reducer. But when I introduce a combiner in between to reduce the load on reducer, the keys are getting repeated in the result. TIA. Code is given below.
mapper.py:
#!/usr/bin/env python
import sys
from datetime import datetime
for line in sys.stdin:
data = line.strip().split("\t")
if(len(data) < 5):
continue
date, time, store, item, cost, payment = data
print("{0}\t{1}".format(datetime.strptime(date, "%Y-%m-%d").weekday(),cost))
combiner.py:
#!/usr/bin/env python
import sys
from datetime import datetime
oldKey = None
salesList = ""
for line in sys.stdin:
data = line.rstrip().split('\t')
thisKey, thisSale = data
if(oldKey and oldKey != thisKey):
print("{0}\t{1}".format(oldKey,salesList))
salesList = ""
else:
if(oldKey == thisKey):
if(salesList != ""):
salesList = salesList + ',' + thisSale
else:
salesList = thisSale
oldKey = thisKey
if(oldKey):
salesList = salesList + ',' + thisSale
print("{0}\t{1}".format(oldKey,salesList))
salesList = ""
reducer.py
#!/usr/bin/env python
import sys
from datetime import datetime
oldKey = None
meanSales = None
salesTotal = 0
count = 0
for line in sys.stdin:
data = line.rstrip().split('\t')
thisKey, thisSale = data
thisSaleList = thisSale.split(',')
thisSaleListFloat = list(map(float, thisSaleList))
meanSales = sum(thisSaleListFloat)/len(thisSaleListFloat)
print("{0}\t{1}".format(thisKey, meanSales))
Output
0 249.91917747419384
0 250.09807318775844
1 249.87984898663836
1 249.59593170284487
2 249.95321425419965
2 249.75339205149234
3 249.54634982922747
3 250.19731461573994
4 250.3129656082639
4 250.13323419720658
5 250.13367036331366
5 250.03468060131152
6 250.207532163134
6 249.67593639719652
I was writing a script in python using google-cloud-datastore python module to upload data from my CSV to datastore. The script seems to work fine but There seems to be a problem that I'm stuck with. I see that the integer values from my CSV are being stored as Floating point number. Is it a default way of sending data to datastore or am I doing something wrong?
Here's my code:
import sys
import getopt
import pandas as pd
from google.cloud import datastore
def write_dict_chunks(data, SIZE=100):
log_count = 0
datastore_client = datastore.Client()
task_key = datastore_client.key(kind)
for i in xrange(0, len(data), SIZE):
entities = []
for each_entry in data[i : i+SIZE]:
nan_check = lambda v: v if str(v)!='nan' else None
string_check = lambda v: v.decode('utf-8') if isinstance(v, str) else v
write_row = {k: nan_check(string_check(v)) for k, v in each_entry.iteritems()}
entity = datastore.Entity(key=task_key)
entity.update(write_row)
entities.append(entity)
datastore_client.put_multi(entities)
log_count += len(entities)
print 'Wrote {} entities to datastore'.format(log_count)
try:
opts, args = getopt.getopt(sys.argv[1:], "ho:v", ["kind=", "filepath="])
if len(args) > 0:
for each in args:
print 'Unrecognized argument: '+each
sys.exit(2)
except getopt.GetoptError as err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
print 'Usage: python parse_csv.py --kind=kind_name --filepath=path_to_csv'
kind = None
filepath = None
for option, argument in opts:
if option in '--kind':
kind = argument
elif option in '--filepath':
filepath = argument
df = pd.read_csv(filepath)
df = df.to_dict(orient='records')
write_dict_chunks(df)
So I have created this code for my research, but I want to use it for plenty of data files, I do not want to do it manually, which means retyping some lines in my code to use desired file. How to use input command in python (I work with python 2.7 on Windows OS) to use it faster, just by typing name of desired datafile. My code so far:
import iodata as io
import matplotlib.pyplot as plt
import numpy as np
import time
from scipy.signal import welch
from scipy import signal
testInstance = io.InputConverter()
start = time.time()
conversionError = io.ConversionError()
#data = testInstance.convert(r"S:\Doktorat\Python\", 1", conversionError)
data = testInstance.convert(r"/Users/PycharmProjects/Hugo/20160401", "201604010000", conversionError)
end = time.time()
print("time elapsed " + str(end - start))
if(conversionError.conversionSucces):
print("Conversion succesful")
if(conversionError.conversionSucces == False):
print("Conversion failed: " + conversionError.conversionErrorLog)
print "Done!"
# Create a new subplot for two cannals 1 & 3
a = np.amin(data.data)
Bx = data.data[0,]
By = data.data[1,]
dt = float(300)/266350
Fs = 1/dt
t = np.arange(0,300,dt*1e3)
N = len(Bx)
M = len(By)
time = np.linspace(0,300,N)
time2 = np.linspace(0,300,M)
filename = 'C:/Users/PycharmProjects/Hugo/20160401/201604010000.dat'
d = open(filename,'rb')
degree = u"\u00b0"
headersize = 64
header = d.read(headersize)
ax1 = plt.subplot(211)
ax1.set_title(header[:16] + ', ' + # station name
'Canals: '+header[32:33]+' and '+header[34:35]+ ', ' # canals
+'Temp'+header[38:43]+degree+'C' # temperature
+', '+'Time:'+header[26:32]+', '+'Date'+' '+header[16:26]) # date
plt.ylabel('Pico Tesle [pT]')
plt.xlabel('Time [ms]')
plt.grid()
plt.plot(time[51:-14], Bx[51:-14], label='Canal 1', color='r', linewidth=0.1, linestyle="-")
plt.plot(time2[1:-14], By[1:-14], label='Canal 3', color='b', linewidth=0.1, linestyle="-")
plt.legend(loc='upper right', frameon=False, )
# Create a new subplot for FFT
plt.subplot(212)
plt.title('Fast Fourier Transform')
plt.ylabel('Power [a.u.]')
plt.xlabel('Frequency Hz')
xaxis2 = np.arange(0,470,10)
plt.xticks(xaxis2)
fft1 = (Bx[51:-14])
fft2 = (By[1:-14])
plt.grid()
# Loop for FFT data
for dataset in [fft1]:
dataset = np.asarray(dataset)
freqs, psd = welch(dataset, fs=266336/300, window='hamming', nperseg=8192)
plt.semilogy(freqs, psd/dataset.size**0, color='r')
for dataset2 in [fft2]:
dataset2 = np.asarray(dataset2)
freqs2, psd2 = welch(dataset2, fs=266336/300, window='hamming', nperseg=8192)
plt.semilogy(freqs2, psd2/dataset2.size**0, color='b')
plt.show()
As you can see there are some places where it would be better to put input and when I run the code I can write names of filenames etc. to python instead of creating every single pythonfile, with specified info in the code.
Btw. I use Pycharm to my python.
If all you are trying to do is get rid of the hardcoded pathname, you should be able to format your name string with input variables
name = raw_input("Name: ")
measurement = raw_input("Measurement: ")
filename = "C:/Users/PycharmProjects/{0}/{1}".format(name, measurement)
see raw_input and string formatting
I am trying to convert the xls into csv file using pandas in python. But I am getting the following error like 'XLRDError: No sheet named <'Sheet1'>'. I have verified the sheet name and it is same as specified above, but I don't how to correct this error. Please find my code below.
CODE:
def xls_2_csv():
import pandas as pd
data_xls = pd.read_excel(r'c:\delivery\file1.xls','Sheet1', index_col=None)
data_xls.to_csv(r'C:\test\file1.csv', encoding='utf-8',index=None)
xls_2_csv()
Please help me in solving this error. Thanks in advance.
I found the same problem in python 3.6 and pandas version is 0.25.1.
The following should work:
import pandas as pd
file = 'your excel file path'
# the file is endswith '.xls' and there is multiple sheets
# error method
df_sheet1 = pd.read_excel(file, sheet_name='Sheet1')
df_sheet2 = pd.read_excel(file, sheet_name='Sheet2')
# when read Sheet1 had no error, but when read Sheet2, had an error:
# xlrd.biffh.XLRDError: No sheet named <'Sheet2'>
# right method
with pd.ExcelFile(file) as xls:
for sheet_name in xls.sheet_names:
df = pd.read_excel(xls, sheet_name=sheet_name)
print(df.head())
Hi I tried the following code it worked for me.
CODE:
import logging
import time
import traceback
import xlrd
import csv
import sys
import re
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
xls = input file path
target = output file path
logging.info("Start converting: From '" + xls + "' to '" + target + "'. ")
try:
start_time = time.time()
wb = xlrd.open_workbook(xls)
sh = wb.sheet_by_index(0)
csvFile = open(target, 'wb')
wr = csv.writer(csvFile, quoting=csv.QUOTE_ALL)
for row in xrange(sh.nrows):
rowValues = sh.row_values(row)
newValues = []
for s in rowValues:
if isinstance(s, unicode):
strValue = (str(s.encode("utf-8")))
else:
strValue = (str(s))
isInt = bool(re.match("^([0-9]+)\.0$", strValue))
if isInt:
strValue = int(float(strValue))
else:
isFloat = bool(re.match("^([0-9]+)\.([0-9]+)$", strValue))
isLong = bool(re.match("^([0-9]+)\.([0-9]+)e\+([0-9]+)$", strValue))
if isFloat:
strValue = float(strValue)
if isLong:
strValue = int(float(strValue))
newValues.append(strValue)
wr.writerow(newValues)
csvFile.close()
logging.info("Finished in %s seconds", time.time() - start_time)
except Exception as e:
print (str(e) + " " + traceback.format_exc())
First of all I am pretty new at python, so bear with me. I am attempting to read from one file, retrieve specific values and overwrite old values in another file with a similar format. The format is 'text value=xxx' in both files. I have the first half of the program working, I can extract the values I want and have placed them into a dict named 'params{}'. The part I haven't figured out is how to just write the specific value into the target file without it showing up at the end of the file or just writing garbage or only half of the file. Here is my source code so far:
import os, os.path, re, fileinput, sys
#set the path to the resource files
#res_files_path = r'C:\Users\n518013\Documents\203-104 WA My MRT Files\CIA Data\pelzer_settings'
tst_res_files_path = r'C:\resource'
# Set path to target files.
#tar_files_path = r'C:\Users\n518013\Documents\203-104 WA My MRT Files\CIA Data\CIA3 Settings-G4'
tst_tar_files_path = r'C:\target'
#test dir.
test_files_path = r'C:\Users\n518013\Documents\MRT Equipment - BY 740-104 WA\CIA - AS\Setting Files\305_70R_22_5 setting files\CIA 1 Standard'
# function1 to find word index and point to value
def f_index(lst, item):
ind = lst.index(item)
val = lst[ind + 3]
print val
return val
# function 2 for values only 1 away from search term
def f_index_1(lst, item):
ind = lst.index(item)
val = lst[ind + 1]
return val
# Create file list.
file_lst = os.listdir(tst_res_files_path)
# Traverse the file list and read in dim settings files.
# Set up dict.
params = {}
#print params
for fname in file_lst:
file_loc = os.path.join(tst_res_files_path, fname)
with open(file_loc, 'r') as f:
if re.search('ms\.', fname):
print fname
break
line = f.read()
word = re.split('\W+', line)
print word
for w in word:
if w == 'Number':
print w
params['sectors'] = f_index(word, w)
elif w == 'Lid':
params['lid_dly'] = f_index(word, w)
elif w == 'Head':
params['rotation_dly'] = f_index(word, w)
elif w == 'Horizontal':
tmp = f_index_1(word, w)
param = int(tmp) + 72
params['horizontal'] = str(param)
elif w == 'Vertical':
tmp = f_index_1(word, w)
param = int(tmp) - 65
params['vertical'] = str(param)
elif w == 'Tilt':
params['tilt'] = f_index_1(word, w)
else:
print 'next...'
print params #this is just for debugging
file_tar = os.path.join(tst_tar_files_path, fname)
for lines in fileinput.input(file_tar, inplace=True):
print lines.rstrip()
if lines.startswith('Number'):
if lines[-2:-1] != params['sectors']:
repl = params['sectors']
lines = lines.replace(lines[-2:-1], repl)
sys.stdout.write(lines)
else:
continue
Sample text files:
[ADMINISTRATIVE SETTINGS]
SettingsType=SingleScan
DimensionCode=
Operator=
Description=rev.1 4sept03
TireDimClass=Crown
TireWidth=400mm
[TEST PARAMETERS]
Number Of Sectors=9
Vacuum=50
[DELAY SETTINGS]
Lid Close Delay=3
Head Rotation Delay=3
[HEAD POSITION]
Horizontal=140
Vertical=460
Tilt=0
[CALIBRATION]
UseConvFactors=0
LengthUnit=0
ConvMMX=1
ConvPixelX=1
CorrFactorX=1
ConvMMY=1
ConvPixelY=1
CorrFactorY=1
end sample txt.
The code I have only writes about half of the file back, and I don't understand why? I am trying to replace the line 'Number of Sectors=9' with 'Number of Sectors=8' if I could get this to work, the rest of the replacements can be done using if statements.
Please help! I've spent hours on google looking for answers and info and everything I find gets me close but no cigar!
Thank you all in advance!
your file has the '.ini' format. python supports reading and writing those with the ConfigParser module. you could do this:
# py3: from pathlib import Path
import os.path
import configparser
# py3: IN_PATH = Path(__file__).parent / '../data/sample.ini'
# py3: OUT_PATH = Path(__file__).parent / '../data/sample_out.ini'
HERE = os.path.dirname(__file__)
IN_PATH = os.path.join(HERE, '../data/sample.ini')
OUT_PATH = os.path.join(HERE, '../data/sample_out.ini')
config = configparser.ConfigParser()
# py3: config.read(str(IN_PATH))
config.read(IN_PATH)
print(config['CALIBRATION']['LengthUnit'])
config['CALIBRATION']['LengthUnit'] = '27'
# py3: with OUT_PATH.open('w') as fle:
with open(OUT_PATH, 'w') as fle:
config.write(fle)