CSV files in Python - python-2.7

I am working on a machine learning project and here is my code:
import csv
import numpy as np
import string
from sklearn.ensemble import RandomForestRegressor

def main():
    alchemy_category_set = {}
    # read train data
    train = []
    target = []
    with open("/media/halawa/93B77F681EC1B4D2/GUC/Semster 8/CSEN 1022 Machine Learning/2/train.csv", 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        reader.next()  # skip the header
        for row in reader:
            line = row[3:len(row)-1]
            train.append(line)
            target.append(row[len(row)-1])
            if row[3] not in alchemy_category_set:
                alchemy_category_set[row[3]] = len(alchemy_category_set)
    # read valid data
    valid = []
    valid_index = []
    with open("/media/halawa/93B77F681EC1B4D2/GUC/Semster 8/CSEN 1022 Machine Learning/2/test.csv", 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        reader.next()  # skip the header
        for row in reader:
            line = row[3:len(row)]
            valid.append(line)
            valid_index.append(row[1])
            if row[3] not in alchemy_category_set:
                alchemy_category_set[row[3]] = len(alchemy_category_set)

if __name__ == "__main__":
    main()
Reading test.csv is not working, although it works with train.csv. When I run it, I get:
/usr/bin/python2.7 /home/halawa/PycharmProjects/ML/train.py
Traceback (most recent call last):
File "/home/halawa/PycharmProjects/ML/train.py", line 68, in <module>
main()
File "/home/halawa/PycharmProjects/ML/train.py", line 26, in main
reader.next() #skip the header
StopIteration
Process finished with exit code 1
The problem is with reading the CSV file. Any help would be appreciated.

I think you just forgot indentation after opening the test file: after the with open line, each of the next 8 lines should be indented by 2 more spaces.
By the way, it is highly recommended to indent with 4 spaces, not just 2, and to indent consistently throughout the file.
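For illustration, a minimal sketch of how that block might look once the body of the with statement is indented consistently (the full path from the question is shortened here for readability):

valid = []
valid_index = []
with open("test.csv", 'rb') as csvfile:  # stands in for the question's full path
    reader = csv.reader(csvfile, delimiter=',')
    reader.next()  # skip the header
    for row in reader:
        valid.append(row[3:len(row)])
        valid_index.append(row[1])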

Related

UnicodeDecodeError in Python Classification Arabic Datasets

I have Arabic datasets for classification using Python; two directories (negative and positive) in a Twitter directory.
I want to use Python classes to classify the data. When I run the attached code, this error occurs:
File "C:\Users\DEV2016\Anaconda2\lib\encodings\utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xc7 in position 0: invalid continuation byte
import sklearn.datasets
import sklearn.feature_extraction.text  # needed for CountVectorizer / TfidfTransformer
import sklearn.metrics
import sklearn.cross_validation
import sklearn.svm
import sklearn.naive_bayes
import sklearn.neighbors

dir_path = "E:\Twitter\Twitter"

# Loading files into memory
files = sklearn.datasets.load_files(dir_path)

# Calculating BOW
count_vector = sklearn.feature_extraction.text.CountVectorizer()
word_counts = count_vector.fit_transform(files.data)

# Calculating TFIDF
tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(use_idf=True).fit(word_counts)
X = tf_transformer.transform(word_counts)

# Create classifier
# clf = sklearn.naive_bayes.MultinomialNB()
# clf = sklearn.svm.LinearSVC()
n_neighbors = 11
weights = 'distance'
clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

# Train-test split
test_size = 0.4
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, files.target, test_size=test_size)

# Test the classifier
clf.fit(X_train, y_train)
y_predicted = clf.predict(X_test)
print(sklearn.metrics.classification_report(y_test, y_predicted,
                                            target_names=files.target_names))
print('Confusion Matrix:')
print(sklearn.metrics.confusion_matrix(y_test, y_predicted))
Traceback
File "<ipython-input-19-8ea269fd9c3d>", line 1, in <module>
runfile('C:/Users/DEV2016/.spyder/clf.py', wdir='C:/Users/DEV2016/.spyder')
File "C:\Users\DEV2016\Anaconda2\lib\site-
packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "C:\Users\DEV2016\Anaconda2\lib\site-
packages\spyder\utils\site\sitecustomize.py", line 87, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/Users/DEV2016/.spyder/clf.py", line 18, in <module>
word_counts=count_vector.fit_transform(files.data)
File "C:\Users\DEV2016\Anaconda2\lib\site-
packages\sklearn\feature_extraction\text.py", line 869, in fit_transform
self.fixed_vocabulary_)
File "C:\Users\DEV2016\Anaconda2\lib\site-
packages\sklearn\feature_extraction\text.py", line 792, in _count_vocab
for feature in analyze(doc):
File "C:\Users\DEV2016\Anaconda2\lib\site-
packages\sklearn\feature_extraction\text.py", line 266, in <lambda>
tokenize(preprocess(self.decode(doc))), stop_words)
File "C:\Users\DEV2016\Anaconda2\lib\site-
packages\sklearn\feature_extraction\text.py", line 116, in decode
doc = doc.decode(self.encoding, self.decode_error)
File "C:\Users\DEV2016\Anaconda2\lib\encodings\utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xc7 in position 0:
invalid continuation byte
In the Twitter data you are trying to load, there are characters that are not valid utf-8. Try loading it with another encoding, for example:
files = sklearn.datasets.load_files(dir_path, encoding="iso-8859-1")
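If that still produces unreadable text, it may be worth trying the Windows Arabic codepage instead; this is only a guess about how the files were saved, but byte 0xc7 is the letter alef in that encoding. load_files also accepts a decode_error argument if you would rather keep utf-8 and tolerate the occasional bad byte:

# Assumption: the tweets were saved with the Windows Arabic codepage
files = sklearn.datasets.load_files(dir_path, encoding="cp1256")
# Alternative: keep utf-8 but replace undecodable bytes instead of raising
files = sklearn.datasets.load_files(dir_path, encoding="utf-8", decode_error="replace")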

ValueError: I/O operation

I see the following error when executing this Python code. What is the issue here?
I have used "sys.stdout.close()", but I still see these errors.
#! /usr/bin/python
import sys
a = [ 10, 12, 13, 14]
sys.stdout = open("file.txt", "w")
print("++++++++")
print("***xyz***")
print("++++++++")
sys.stdout.close()
for i in a:
    print i
Output:
Traceback (most recent call last):
File "./test3.py", line 10, in <module>
print i
ValueError: I/O operation on closed file
You are trying to write to stdout (your file) after closing it: at line 8 you close the file, and at line 10 you call print, which writes to the now-closed stdout.
If you want to write the list a to the file, you should close it only after the for loop.
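A minimal sketch of that fix, keeping your approach of redirecting stdout (the only change is moving the close below the loop):

#! /usr/bin/python
import sys
a = [ 10, 12, 13, 14]
sys.stdout = open("file.txt", "w")
print("++++++++")
print("***xyz***")
print("++++++++")
for i in a:
    print i           # stdout is still open here
sys.stdout.close()    # close only after all writing is done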
Better yet, consider using with open, because then you don't have to worry about closing the file at all. And if your list needs to stay a list, consider pickling it instead of writing it as text; pickling serializes your data.
#!python3
# import modules
from os import system
import pickle

# clear the screen (Windows-only command)
system('cls')

a = [ 10, 12, 13, 14]

# write a list to file, but it has to be written as a string
with open('file.txt', 'w') as wf:
    wf.write(str(a))

# when you open your file up, the data is a string
with open('file.txt', 'r') as fp:
    for item in fp:
        print(item)
        print(type(item))

# if you want to retain your data as a list, then pickle it
output = open('file.pkl', 'wb')
pickle.dump(a, output)
output.close()

# open up a pickled file
pkl_file = open('file.pkl', 'rb')
data = pickle.load(pkl_file)
print(data)
print(type(data))
pkl_file.close()

Getting Error when reading a file

I'm getting a simple but confusing error when trying to create a login worker in Python.
Here's the error I'm getting:
Traceback (most recent call last):
File "stratixlogin.py", line 87, in <module>
main()
File "stratixlogin.py", line 78, in main
login_worker()
File "stratixlogin.py", line 51, in login_worker
data = f.read()
ValueError: Mixing iteration and read methods would lose data
Here is where the error is occurring:
with open("global_users.txt", "r") as f:
for line in f:
data = f.read()
if data == username_ask:
print(G+"Success!")
password_ask = raw_input(O+"Password:"+W+" ")
with open("global_passwords.txt", "r") as f:
for line in f:
data = f.read()
if data == password_ask:
print(G+"Success!")
else:
print(R+"Incorrect Password!")
else:
print(R+"No Users Found!")
I am not sure what the error is here, but I am confused about how to fix it. Any ideas?
You can't mix iterating over the lines of a file (the for loop) with read(). Iterating alone is enough:
with open("global_users.txt", "r") as f:
for data in f:
if data == username_ask:
print(G+"Success!")
password_ask = raw_input(O+"Password:"+W+" ")
with open("global_passwords.txt", "r") as f:
for line in f:
data = f.read()
if data == password_ask:
print(G+"Success!")
else:
print(R+"Incorrect Password!")
else:
print(R+"No Users Found!")

Memory error even though RAM is free

I am merging files together in 4 folders. Within those 4 folders I am merging 80 .dbf files, each of which is 35 megabytes, using the following code:
import os
import pandas as pd
from simpledbf import Dbf5

list1 = []
folders = r'F:\dbf_tables'
out = r'F:\merged'

if not os.path.isdir(out):
    os.mkdir(out)

for folder in os.listdir(folders):
    if not os.path.isdir(os.path.join(out, folder)):
        os.mkdir(os.path.join(out, folder))
    for f in os.listdir(os.path.join(folders, folder)):
        if '.xml' not in f:
            if '.cpg' not in f:
                table = Dbf5(os.path.join(folders, folder, f))
                df = table.to_dataframe()
                list1.append(df)
    dfs = reduce(lambda left, right: pd.merge(left, right, on=['POINTID'], how='outer'), list1)
    dfs.to_csv(os.path.join(out, folder, 'combined.csv'), index=False)
Almost immediately after running the code, I receive this error:
Traceback (most recent call last):
File "<ipython-input-1-77eb6fd0cda7>", line 1, in <module>
runfile('F:/python codes/prelim_codes/raster_to_point.py', wdir='F:/python codes/prelim_codes')
File "C:\Users\spotter\AppData\Local\Continuum\Anaconda_64\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 714, in runfile
execfile(filename, namespace)
File "C:\Users\spotter\AppData\Local\Continuum\Anaconda_64\lib\site-packages\spyderlib\widgets\externalshell\sitecustomize.py", line 74, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "F:/python codes/prelim_codes/raster_to_point.py", line 66, in <module>
dfs = reduce(lambda left,right: pd.merge(left,right,on=['POINTID'],how='outer',),list1)
File "F:/python codes/prelim_codes/raster_to_point.py", line 66, in <lambda>
dfs = reduce(lambda left,right: pd.merge(left,right,on=['POINTID'],how='outer',),list1)
File "C:\Users\spotter\AppData\Local\Continuum\Anaconda_64\lib\site-packages\pandas\tools\merge.py", line 39, in merge
return op.get_result()
File "C:\Users\spotter\AppData\Local\Continuum\Anaconda_64\lib\site-packages\pandas\tools\merge.py", line 217, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "C:\Users\spotter\AppData\Local\Continuum\Anaconda_64\lib\site-packages\pandas\tools\merge.py", line 353, in _get_join_info
sort=self.sort, how=self.how)
File "C:\Users\spotter\AppData\Local\Continuum\Anaconda_64\lib\site-packages\pandas\tools\merge.py", line 559, in _get_join_indexers
return join_func(lkey, rkey, count, **kwargs)
File "pandas\src\join.pyx", line 160, in pandas.algos.full_outer_join (pandas\algos.c:61256)
MemoryError
but only 30% of my memory is being used, which is pretty much the baseline.
EDIT:
I picked out only 2 files and tried the merge using:
merge=pd.merge(df1,df2, on=['POINTID'], how='outer')
and still get a memory error; something weird is going on.
When I run the same thing in 32-bit Anaconda I get ValueError: negative dimensions are not allowed
EDIT:
The entire problem stemmed from the solution given here:
Value Error: negative dimensions are not allowed when merging
EDITED based on comment:
Try this (it is enough to use a single if statement with combined conditions):
import os
import pandas as pd
from simpledbf import Dbf5

folders = r'F:\dbf_tables'
out = r'F:\merged'

if not os.path.isdir(out):
    os.mkdir(out)

for folder in os.listdir(folders):
    if not os.path.isdir(os.path.join(out, folder)):
        os.mkdir(os.path.join(out, folder))

    # Initialize an empty dataframe per folder
    dfs = pd.DataFrame(columns=['POINTID'])

    for f in os.listdir(os.path.join(folders, folder)):
        if ('.xml' not in f) and ('.cpg' not in f):
            table = Dbf5(os.path.join(folders, folder, f))
            df = table.to_dataframe()
            # Merge the current dataframe into the result dataframe
            dfs = dfs.merge(df, on=['POINTID'], how='outer')

    # Save the results per folder
    dfs.to_csv(os.path.join(out, folder, 'combined.csv'), index=False)
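Compared with the original approach, this also changes the memory profile: instead of holding all 80 dataframes in list1 plus the intermediates created by the reduce chain, only the single growing result frame and the file currently being merged need to be in memory at any one time.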

NLTK python tokenizing a CSV file

I have begun to experiment with Python and NLTK.
I am getting a lengthy error message for which I cannot find a solution, and would appreciate any insights you may have.
import nltk, csv, numpy
from nltk import sent_tokenize, word_tokenize, pos_tag
reader = csv.reader(open('Medium_Edited.csv', 'rU'), delimiter=",", quotechar='|')
tokenData = nltk.word_tokenize(reader)
I'm running Python 2.7 and the latest nltk package on OSX Yosemite.
I also attempted these two lines of code, with no difference in the result:
with open("Medium_Edited.csv", "rU") as csvfile:
    tokenData = nltk.word_tokenize(reader)
These are the error messages I see:
Traceback (most recent call last):
File "nltk_text.py", line 11, in <module>
tokenData = nltk.word_tokenize(reader)
File "/Library/Python/2.7/site-packages/nltk/tokenize/__init__.py", line 101, in word_tokenize
return [token for sent in sent_tokenize(text, language)
File "/Library/Python/2.7/site-packages/nltk/tokenize/__init__.py", line 86, in sent_tokenize
return tokenizer.tokenize(text)
File "/Library/Python/2.7/site-packages/nltk/tokenize/punkt.py", line 1226, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "/Library/Python/2.7/site-packages/nltk/tokenize/punkt.py", line 1274, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "/Library/Python/2.7/site-packages/nltk/tokenize/punkt.py", line 1265, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "/Library/Python/2.7/site-packages/nltk/tokenize/punkt.py", line 1304, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "/Library/Python/2.7/site-packages/nltk/tokenize/punkt.py", line 310, in _pair_iter
prev = next(it)
File "/Library/Python/2.7/site-packages/nltk/tokenize/punkt.py", line 1278, in _slices_from_text
for match in self._lang_vars.period_context_re().finditer(text):
TypeError: expected string or buffer
Thanks in advance
As you can read in the Python csv documentation, csv.reader "returns a reader object which will iterate over lines in the given csvfile". In other words, if you want to tokenize the text in your csv file, you will have to go through the lines and the fields in those lines:
for line in reader:
    for field in line:
        tokens = word_tokenize(field)
Also, when you import word_tokenize at the beginning of your script, you should call it as word_tokenize, and not as nltk.word_tokenize. This also means you can drop the import nltk statement.
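Putting both points together, a minimal runnable sketch (file name, delimiter, and quote character taken from the question; it assumes the NLTK punkt tokenizer models are installed):

import csv
from nltk import word_tokenize

tokens = []
with open('Medium_Edited.csv', 'rU') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for line in reader:
        for field in line:
            tokens.extend(word_tokenize(field))
print tokens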
The error expected string or buffer is raised because word_tokenize was given the reader object rather than a string. Wrapping it as
tokenData = nltk.word_tokenize(str(reader))
makes the error go away, but be aware that str(reader) is just the reader object's repr, not the file's contents, so the field-by-field approach above is what actually tokenizes the text.