UnicodeDecodeError in Python Classification Arabic Datasets - python-2.7

I have Arabic datasets for classification in Python: two directories (negative and positive) inside a Twitter directory.
I want to use Python classes to classify the data. When I run the code below, this error occurs:
File "C:\Users\DEV2016\Anaconda2\lib\encodings\utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xc7 in position 0: invalid continuation byte
import sklearn.datasets
import sklearn.metrics
import sklearn.cross_validation
import sklearn.svm
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.feature_extraction.text  # needed for CountVectorizer and TfidfTransformer below

dir_path = r"E:\Twitter\Twitter"  # raw string so the backslashes are not treated as escape sequences
# Loading files into memory
files = sklearn.datasets.load_files(dir_path)
# Calculating BOW
count_vector = sklearn.feature_extraction.text.CountVectorizer()
word_counts=count_vector.fit_transform(files.data)
# Calculating TFIDF
tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(use_idf=True).fit(word_counts)
X = tf_transformer.transform(word_counts)
# Create classifier
# clf = sklearn.naive_bayes.MultinomialNB()
# clf = sklearn.svm.LinearSVC()
n_neighbors = 11
weights = 'distance'
clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
# Test the classifier
# Train-test split
test_size=0.4
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, files.target, test_size=test_size)
# Test classifier
clf.fit(X_train, y_train)
y_predicted = clf.predict(X_test)
print(sklearn.metrics.classification_report(y_test, y_predicted,
                                            target_names=files.target_names))
print ('Confusion Matrix:')
print (sklearn.metrics.confusion_matrix(y_test, y_predicted))
Traceback:
  File "<ipython-input-19-8ea269fd9c3d>", line 1, in <module>
    runfile('C:/Users/DEV2016/.spyder/clf.py', wdir='C:/Users/DEV2016/.spyder')
  File "C:\Users\DEV2016\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
    execfile(filename, namespace)
  File "C:\Users\DEV2016\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 87, in execfile
    exec(compile(scripttext, filename, 'exec'), glob, loc)
  File "C:/Users/DEV2016/.spyder/clf.py", line 18, in <module>
    word_counts=count_vector.fit_transform(files.data)
  File "C:\Users\DEV2016\Anaconda2\lib\site-packages\sklearn\feature_extraction\text.py", line 869, in fit_transform
    self.fixed_vocabulary_)
  File "C:\Users\DEV2016\Anaconda2\lib\site-packages\sklearn\feature_extraction\text.py", line 792, in _count_vocab
    for feature in analyze(doc):
  File "C:\Users\DEV2016\Anaconda2\lib\site-packages\sklearn\feature_extraction\text.py", line 266, in <lambda>
    tokenize(preprocess(self.decode(doc))), stop_words)
  File "C:\Users\DEV2016\Anaconda2\lib\site-packages\sklearn\feature_extraction\text.py", line 116, in decode
    doc = doc.decode(self.encoding, self.decode_error)
  File "C:\Users\DEV2016\Anaconda2\lib\encodings\utf_8.py", line 16, in decode
    return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xc7 in position 0: invalid continuation byte

The Twitter data you are trying to load contains bytes that utf-8 cannot decode. Try loading it with another encoding, for example:
files = sklearn.datasets.load_files(dir_path, encoding="iso-8859-1")
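If iso-8859-1 gives you readable text, you are done. If the Arabic comes out garbled, it may be worth trying the Windows Arabic code page instead, or telling the vectorizer to replace undecodable bytes rather than raising. This is only a sketch; cp1256 and the decode_error parameter are suggestions on my part, not something the original data guarantees:
import sklearn.datasets
import sklearn.feature_extraction.text

dir_path = r"E:\Twitter\Twitter"

# Assumption: the tweets were saved with the Windows Arabic code page (cp1256).
# If not, substitute whatever encoding the files actually use.
files = sklearn.datasets.load_files(dir_path, encoding="cp1256")

# Alternative: load raw bytes (encoding=None, the default) and let the
# vectorizer replace undecodable bytes instead of raising:
# files = sklearn.datasets.load_files(dir_path)
# count_vector = sklearn.feature_extraction.text.CountVectorizer(
#     encoding="utf-8", decode_error="replace")

count_vector = sklearn.feature_extraction.text.CountVectorizer()
word_counts = count_vector.fit_transform(files.data)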

Related

Python Pandas 'str' object is not callable

I am new to Python and was trying the Pandas library. Here is the code to read a CSV file without headers:
import pandas as pnd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
pnd.set_option('max_columns', 50)
mpl.rcParams['lines.linewidth'] = 2
headers = ['OrderId', 'OrderDate', 'UserId', 'TotalCharges']
dtypes = {'OrderId': 'int', 'OrderDate': 'str', 'UserId': 'int', 'TotalCharges':'float'}
parse_dates = ['OrderDate']
df = pnd.read_csv('Raw_flight_data.csv', sep='\t', header=None,
                  names=headers, converters=dtypes, parse_dates=parse_dates)
This code gives me an error:
runfile('C:/Users/rohan.arora/Desktop/Python/example.py', wdir='C:/Users/rohan.arora/Desktop/Python')
Traceback (most recent call last):
File "<ipython-input-47-43fc22883149>", line 1, in <module>
runfile('C:/Users/rohan.arora/Desktop/Python/example.py', wdir='C:/Users/rohan.arora/Desktop/Python')
File "C:\Users\rohan.arora\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 866, in runfile
execfile(filename, namespace)
File "C:\Users\rohan.arora\Anaconda2\lib\site-packages\spyder\utils\site\sitecustomize.py", line 87, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/Users/rohan.arora/Desktop/Python/example.py", line 13, in <module>
names=headers,converters=dtypes,parse_dates=parse_dates)
File "C:\Users\rohan.arora\Anaconda2\lib\site-packages\pandas\io\parsers.py", line 646, in parser_f
return _read(filepath_or_buffer, kwds)
File "C:\Users\rohan.arora\Anaconda2\lib\site-packages\pandas\io\parsers.py", line 401, in _read
data = parser.read()
File "C:\Users\rohan.arora\Anaconda2\lib\site-packages\pandas\io\parsers.py", line 939, in read
ret = self._engine.read(nrows)
File "C:\Users\rohan.arora\Anaconda2\lib\site-packages\pandas\io\parsers.py", line 1508, in read
data = self._reader.read(nrows)
File "pandas\parser.pyx", line 848, in pandas.parser.TextReader.read (pandas\parser.c:10415)
File "pandas\parser.pyx", line 870, in pandas.parser.TextReader._read_low_memory (pandas\parser.c:10691)
File "pandas\parser.pyx", line 947, in pandas.parser.TextReader._read_rows (pandas\parser.c:11728)
File "pandas\parser.pyx", line 1044, in pandas.parser.TextReader._convert_column_data (pandas\parser.c:13129)
File "pandas\parser.pyx", line 2115, in pandas.parser._apply_converter (pandas\parser.c:28771)
TypeError: 'str' object is not callable
I am using Anaconda Spyder 3.1.2 and running Python 2.7.13.
I think you need to remove the quotes and pass the types themselves, not string representations of the types:
dtypes = {'OrderId': 'int', 'OrderDate': 'str', 'UserId': 'int', 'TotalCharges':'float'}
to:
dtypes = {'OrderId': int, 'OrderDate': str, 'UserId': int, 'TotalCharges': float}
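For completeness, here is the call from the question with only that change applied (a sketch that assumes the same file and column layout):
import pandas as pnd

headers = ['OrderId', 'OrderDate', 'UserId', 'TotalCharges']
# Type objects, not strings: converters= expects callables it can apply to each value.
dtypes = {'OrderId': int, 'OrderDate': str, 'UserId': int, 'TotalCharges': float}
parse_dates = ['OrderDate']

df = pnd.read_csv('Raw_flight_data.csv', sep='\t', header=None,
                  names=headers, converters=dtypes, parse_dates=parse_dates)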

Draw graph using networkx

Here's my code:
import networkx as nx
import matplotlib.pyplot as plt
fh = open('one.txt', 'r')
G = nx.read_edgelist(fh, nodetype=int)
fh.close()
print nx.info(G)
nx.draw(G)
plt.show()
But when I run it I get the following error:
Traceback (most recent call last): File "graphDeBruijn.py", line 5,
in <module> G=nx.read_edgelist(fh, nodetype=int) File "<decorator-gen-286>", line 2,
in read_edgelist File "C:\PYTHON27\lib\site-packages\networkx-2.0.dev20161201181419-py2.7.egg\networkx\utils\decorators.py", line 221,
in _open_file result = func(*new_args, **kwargs) File "C:\PYTHON27\lib\site-packages\networkx-2.0.dev20161201181419-py2.7.egg\networkx\readwrite\edgelist.py", line 374,
in read_edgelist data=data) File "C:\PYTHON27\lib\site-packages\networkx-2.0.dev20161201181419-py2.7.egg\networkx\readwrite\edgelist.py", line 255,
in parse_edgelist for line in lines: File "C:\PYTHON27\lib\site-packages\networkx-2.0.dev20161201181419-py2.7.egg\networkx\readwrite\edgelist.py", line 371,
in <genexpr> lines = (line.decode(encoding) for line in path) File "C:\PYTHON27\lib\encodings\utf_8.py", line 16,
in decode return codecs.utf_8_decode(input, errors, True) UnicodeDecodeError: 'utf8' codec can't decode byte 0xff in position 0: invalid start byte
Can anyone help me? Thanks!

pandas reading .csv files

I have a small script that uses pandas to read and print a .csv file generated by MS Excel.
import pandas as pd
data = pd.read_csv('./2010-11.csv')
print(data)
This script runs fine under Python 2.7.8, but under Python 3.4.1 it gives the following
error. Any ideas why this might be so? Thanks in advance for any help.
Traceback (most recent call last):
File "proc_csv_0-0.py", line 3, in <module>
data = pd.read_csv('./2010-11.csv')
File "/usr/lib64/python3.4/site-packages/pandas/io/parsers.py", line 474, in parser_f
return _read(filepath_or_buffer, kwds)
File "/usr/lib64/python3.4/site-packages/pandas/io/parsers.py", line 260, in _read
return parser.read()
File "/usr/lib64/python3.4/site-packages/pandas/io/parsers.py", line 721, in read
ret = self._engine.read(nrows)
File "/usr/lib64/python3.4/site-packages/pandas/io/parsers.py", line 1170, in read
data = self._reader.read(nrows)
File "pandas/parser.pyx", line 769, in pandas.parser.TextReader.read (pandas/parser.c:7566)
File "pandas/parser.pyx", line 791, in pandas.parser.TextReader._read_low_memory (pandas/parser.c:7806)
File "pandas/parser.pyx", line 866, in pandas.parser.TextReader._read_rows (pandas/parser.c:8639)
File "pandas/parser.pyx", line 973, in pandas.parser.TextReader._convert_column_data (pandas/parser.c:9950)
File "pandas/parser.pyx", line 1033, in pandas.parser.TextReader._convert_tokens (pandas/parser.c:10737)
File "pandas/parser.pyx", line 1130, in pandas.parser.TextReader._convert_with_dtype (pandas/parser.c:12141)
File "pandas/parser.pyx", line 1150, in pandas.parser.TextReader._string_convert (pandas/parser.c:12355)
File "pandas/parser.pyx", line 1382, in pandas.parser._string_box_utf8 (pandas/parser.c:17679)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc9 in position 4: unexpected end of data
In Python 3, when pd.read_csv is passed a file path (as opposed to a file buffer) it decodes the contents with the utf-8 codec by default.[1] It appears your CSV file is using a different encoding. Since it was generated by MS Excel, it might be cp1252:
In [25]: print('\xc9'.decode('cp1252'))
É
In [27]: import unicodedata as UDAT
In [28]: UDAT.name('\xc9'.decode('cp1252'))
Out[28]: 'LATIN CAPITAL LETTER E WITH ACUTE'
The error message
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc9
says that '\xc9'.decode('utf-8') raises a UnicodeDecodeError.
The above shows byte 0xc9 can be decoded with cp1252. It remains to be seen if the rest of the file can also be decoded with cp1252, and if it produces the desired result.
Unfortunately, given only a file, there is no surefire way to tell what
encoding (if any) was used. It depends entirely on the program used to generate
the file.
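If you would rather start from an educated guess than trial and error, a detector such as chardet can be pointed at the raw bytes. This is an addition on my part, not part of pandas, and its guess is only a heuristic:
import chardet  # third-party package: pip install chardet

with open('./2010-11.csv', 'rb') as f:
    raw = f.read(100000)  # a sample of the file is usually enough

print(chardet.detect(raw))  # e.g. {'encoding': 'windows-1252', 'confidence': 0.73, ...}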
If cp1252 is the right encoding, then to load the file into a DataFrame use
data = pd.read_csv('./2010-11.csv', encoding='cp1252')
[1] When pd.read_csv is passed a buffer, the buffer could have been opened with encoding already set:
# Python3
with open('/tmp/test.csv', 'r', encoding='cp1252') as f:
    df = pd.read_csv(f)
    print(df)
in which case pd.read_csv will not attempt to decode since the buffer f is already supplying decoded strings.

Use random forest to classify reviews, but get a KeyError?

I have the following code in Python:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit( train_data_features, train["sentiment"] )
but I get a KeyError for "sentiment" and I don't know why. The dataframe was read with:
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site--packages/pandas/core/frame.py", line 1780, in __getitem__
return self._getitem_column(key)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.py", line 1787, in _getitem_column
return self._get_item_cache(key)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/generic.py", line 1068, in _get_item_cache
values = self._data.get(item)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/internals.py", line 2849, in get
loc = self.items.get_loc(item)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/index.py", line 1402, in get_loc
return self._engine.get_loc(_values_from_object(key))
File "pandas/index.pyx", line 134, in pandas.index.IndexEngine.get_loc (pandas/index.c:3807)
File "pandas/index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas/index.c:3687)
File "pandas/hashtable.pyx", line 696, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12310)
File "pandas/hashtable.pyx", line 704, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12261)
KeyError: 'sentiment'
Are you doing the Kaggle competition? https://www.kaggle.com/c/word2vec-nlp-tutorial/data
Are you sure you have downloaded and decompressed the file ok? The first part of the file reads:
id sentiment review
"5814_8" 1 "With all this stuff go
This works for me:
>>> train = pd.read_csv("labeledTrainData.tsv", delimiter="\t")
>>> train.columns
Index([u'id', u'sentiment', u'review'], dtype='object')
>>> train.head(3)
id sentiment review
0 5814_8 1 With all this stuff going down at the moment w...
1 2381_9 1 \The Classic War of the Worlds\" by Timothy Hi...
2 7759_3 0 The film starts with a manager (Nicholas Bell)...
You should check that the columns are set up correctly in the train variable: you should have a sentiment column, and it seems to be missing from your dataframe.
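A quick way to check, assuming the file is in the working directory (just a sketch of the verification step):
import pandas as pd

# Same read as in the question; quoting=3 leaves the embedded quotes untouched.
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

# If the download/decompression went fine, the header row yields exactly
# these three columns.
print(train.columns.tolist())        # expected: ['id', 'sentiment', 'review']
print('sentiment' in train.columns)  # should be True before calling forest.fit(...)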

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position

When I try to extract some patterns from a tagged text in nltk, I get the error: UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 79: ordinal not in range(128). I did not have this error at first; it appeared only after installing some packages.
This is the code:
# -*- coding: utf-8 -*-
import codecs
import sys
import re
import nltk
from nltk.corpus import *

k = nltk.corpus.brown.tagged_words('myfile')
for (w1, t1), (w2, t2) in nltk.bigrams(k):
    if t1 == 'NN' and t2 == 'AJ':
        print w1, w2
This is the entire output of the code:
Traceback (most recent call last):
File "/home/fathi/egfe.py", line 12, in <module>
for (w1,t1), (w2,t2) in nltk.bigrams(k):
File "/usr/local/lib/python2.7/dist-packages/nltk/util.py", line 442, in bigrams
for item in ngrams(sequence, 2, **kwargs):
File "/usr/local/lib/python2.7/dist-packages/nltk/util.py", line 419, in ngrams
history.append(next(sequence))
File "/usr/local/lib/python2.7/dist-packages/nltk/corpus/reader/util.py", line 291, in iterate_from
tokens = self.read_block(self._stream)
File "/usr/local/lib/python2.7/dist-packages/nltk/corpus/reader/tagged.py", line 241, in read_block
for para_str in self._para_block_reader(stream):
File "/usr/local/lib/python2.7/dist-packages/nltk/corpus/reader/util.py", line 564, in read_blankline_block
line = stream.readline()
File "/usr/local/lib/python2.7/dist-packages/nltk/data.py", line 1095, in readline
new_chars = self._read(readsize)
File "/usr/local/lib/python2.7/dist-packages/nltk/data.py", line 1322, in _read
chars, bytes_decoded = self._incr_decode(bytes)
File "/usr/local/lib/python2.7/dist-packages/nltk/data.py", line 1352, in _incr_decode
return self.decode(bytes, 'strict')
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 79: ordinal not in range(128)
The problem is that the nltk version is not compatible with the Python version, so it requires an older version of the nltk toolkit.
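A minimal way to check what you actually have before downgrading (the exact version to pin depends on your setup, so no specific number is given here):
# Confirm which nltk is being imported and which version it is.
import nltk
print(nltk.__version__)
print(nltk.__file__)

# Downgrading would then be done from the shell, e.g.
#   pip install "nltk==<older version>"
# where the version to pick depends on what is compatible with your Python 2.7 setup.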