Extract elements from tuples in a list of lists and encode them in Python

I have a list of lists of tuples, with Unicode problems.
I have been struggling to encode these into their equivalent characters and I have been unsuccessful.
Here is a sample of my code:
import spaghetti as sgt
import codecs
f = codecs.open('output-data-pos', encoding='utf-8')
raw = f.read()
reviews = [raw.split()]
output_tagged = (sgt.pos_tag_sents(reviews))
Here is a sample of what output_tagged produces.
[[(u'cerramos', None), (u'igual', u'aq0cs0'), (u'arrancado', None), (u'estanter\xeda', None), (u'\xe9xito', u'ncms000'), (u'an\xe9cdotas', u'ncfp000')]]
My overall objective is to extract each value from the tuples and encode it in UTF-8, for a final result such as:
cerramos None
igual aq0cs0
arrancado None
estantería None
éxito ncms000
anécdotas ncfp000
The strategies that I have tried so far range from simple ones, where I try to write out the list and encode it directly:
d = codecs.open('output-data-tagged', 'w', encoding='utf-8')
d.write(output_tagged)
or this approach
f = open('output-data-tagged', 'w')
for output in output_tagged:
    output.encode('utf-8')
    f.write(output)
f.close()
where I first try to map the list and then encode it:
list_of_lists = map(list, output_tagged)
print list_of_lists
where I try functions to encode the data:
def reprunicode(u):
    return reprunicode(u).decode('raw_unicode_escape')

print u'[%s]' % u', '.join([u'(%s,)' % reprunicode(ti[0]) for ti in output_tagged])
and this one too:
def utf8data(list):
    return [item.decode('utf8') for item in list]

print utf8data(output_tagged)
Considering my many trials, how can I extract the elements from the tuples in the list of lists in order to arrive at my desired final encoding result?
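A minimal sketch of one possible approach, assuming Python 2 and the list-of-lists-of-tuples structure shown above (the output file name is only an example):
import codecs

out = codecs.open('output-data-tagged', 'w', encoding='utf-8')
for sentence in output_tagged:            # outer list: one entry per sentence
    for word, tag in sentence:            # inner entries are (token, tag) tuples
        out.write(u'%s %s\n' % (word, tag))   # a None tag is written as "None"
out.close()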

Related

Counting matrix pairs using a threshold

I have a folder with hundreds of txt files I need to analyse for similarity. Below is an example of a script I use to run similarity analysis. In the end I get an array or a matrix I can plot etc.
I would like to see how many pairs there are with cos_similarity > 0.5 (or any other threshold I decide to use), removing cos_similarity == 1 when I compare the same files, of course.
Secondly, I need a list of these pairs based on file names.
So the output for the example below would look like:
1
and
["doc1", "doc4"]
Will really appreciate your help as I feel a bit lost not knowing which direction to go.
This is an example of my script to get the matrix:
doc1 = "Amazon's promise of next-day deliveries could be investigated amid customer complaints that it is failing to meet that pledge."
doc2 = "The BBC has been inundated with comments from Amazon Prime customers. Most reported problems with deliveries."
doc3 = "An Amazon spokesman told the BBC the ASA had confirmed to it there was no investigation at this time."
doc4 = "Amazon's promise of next-day deliveries could be investigated amid customer complaints..."
documents = [doc1, doc2, doc3, doc4]
# In my real script I iterate through a folder (path) with txt files like this:
# def read_text(path):
#     documents = []
#     for filename in glob.iglob(path + '*.txt'):
#         _file = open(filename, 'r')
#         text = _file.read()
#         documents.append(text)
#     return documents
import nltk, string, numpy
nltk.download('punkt') # first-time use only
stemmer = nltk.stem.porter.PorterStemmer()
def StemTokens(tokens):
    return [stemmer.stem(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
def StemNormalize(text):
    return StemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))
nltk.download('wordnet') # first-time use only
lemmer = nltk.stem.WordNetLemmatizer()
def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))
from sklearn.feature_extraction.text import CountVectorizer
LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')
LemVectorizer.fit_transform(documents)
tf_matrix = LemVectorizer.transform(documents).toarray()
from sklearn.feature_extraction.text import TfidfTransformer
tfidfTran = TfidfTransformer(norm="l2")
tfidfTran.fit(tf_matrix)
tfidf_matrix = tfidfTran.transform(tf_matrix)
cos_similarity_matrix = (tfidf_matrix * tfidf_matrix.T).toarray()
from sklearn.feature_extraction.text import TfidfVectorizer
TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
def cos_similarity(textlist):
    tfidf = TfidfVec.fit_transform(textlist)
    return (tfidf * tfidf.T).toarray()
cos_similarity(documents)
Out:
array([[ 1.        ,  0.1459739 ,  0.03613371,  0.76357693],
       [ 0.1459739 ,  1.        ,  0.11459266,  0.19117117],
       [ 0.03613371,  0.11459266,  1.        ,  0.04732164],
       [ 0.76357693,  0.19117117,  0.04732164,  1.        ]])
As I understood your question, you want to create a function that reads the output numpy array and a certain value (threshold) in order to return two things:
how many pairs of docs have a similarity greater than or equal to the given threshold
the names of these docs.
So, here I've made the following function, which takes three arguments:
the output numpy array from the cos_similarity() function.
a list of document names.
a certain number (threshold).
And here it is:
def get_docs(arr, docs_names, threshold):
    output_tuples = []
    for row in range(len(arr)):
        lst = [row + 1 + idx for idx, num in
               enumerate(arr[row, row+1:]) if num >= threshold]
        for item in lst:
            output_tuples.append((docs_names[row], docs_names[item]))
    return len(output_tuples), output_tuples
Let's see it in action:
>>> docs_names = ["doc1", "doc2", "doc3", "doc4"]
>>> arr = cos_similarity(documents)
>>> arr
array([[ 1.        ,  0.1459739 ,  0.03613371,  0.76357693],
       [ 0.1459739 ,  1.        ,  0.11459266,  0.19117117],
       [ 0.03613371,  0.11459266,  1.        ,  0.04732164],
       [ 0.76357693,  0.19117117,  0.04732164,  1.        ]])
>>> threshold = 0.5
>>> get_docs(arr, docs_names, threshold)
(1, [('doc1', 'doc4')])
>>> get_docs(arr, docs_names, 1)
(0, [])
>>> get_docs(arr, docs_names, 0.13)
(3, [('doc1', 'doc2'), ('doc1', 'doc4'), ('doc2', 'doc4')])
Let's see how this function works:
First, I iterate over every row of the numpy array.
Second, I iterate over every item in the row whose index is bigger than the row's index, so we only walk over the upper triangle of the matrix.
That's because each pair of documents appears twice in the full array; we can see that the two values arr[0][1] and arr[1][0] are the same. You should also notice that the diagonal items aren't included, because we know for sure that they are 1, as every document is perfectly similar to itself :).
Finally, we take the items whose values are greater than or equal to the given threshold and return their indices. These indices are used later to get the document names.
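As an aside, the same upper-triangle walk can be written with numpy's triu_indices; a minimal sketch, assuming the same arr and docs_names as above (get_docs_triu is just an illustrative name):
import numpy as np

def get_docs_triu(arr, docs_names, threshold):
    # indices of the strict upper triangle (k=1 skips the diagonal of 1s)
    rows, cols = np.triu_indices(len(arr), k=1)
    pairs = [(docs_names[i], docs_names[j])
             for i, j in zip(rows, cols) if arr[i, j] >= threshold]
    return len(pairs), pairs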

Python 3 Unicode Objects must be encoded before hashing

I use this code for hashing and salting:
def make_hash(password):
    """Generate a random salt and return a new hash for the password."""
    if isinstance(password, str):
        password = password.encode('utf-8')
    salt = b64encode(urandom(SALT_LENGTH))
    print(salt, type(salt))
    #print(salt.encode('utf-8'), type(salt.encode('utf-8')))
    return 'PBKDF2${}${}${}${}'.format(
        HASH_FUNCTION,
        COST_FACTOR,
        salt,
        b64encode(pbkdf2_bin(password, salt, COST_FACTOR, KEY_LENGTH,
                             getattr(hashlib, HASH_FUNCTION))))
Here is the pbkdf2_bin:
def pbkdf2_bin(data, salt, iterations=1000, keylen=24, hashfunc=None):
    """Returns a binary digest for the PBKDF2 hash algorithm of `data`
    with the given `salt`. It iterates `iterations` times and produces a
    key of `keylen` bytes. By default SHA-1 is used as hash function,
    a different hashlib `hashfunc` can be provided.
    """
    hashfunc = hashfunc or hashlib.sha1
    mac = hmac.new(data, None, hashfunc)
    def _pseudorandom(x, mac=mac):
        h = mac.copy()
        h.update(x)
        return map(int, h.digest())
    buf = []
    for block in range(1, -(-keylen // mac.digest_size) + 1):
        rv = u = _pseudorandom(salt + _pack_int(block))
        for i in range(iterations - 1):
            u = _pseudorandom(''.join(map(chr, u)))
            rv = starmap(xor, zip(rv, u))
        buf.extend(rv)
    return ''.join(map(chr, buf))[:keylen]
I have already adjusted some things:
I replaced unicode -> str
I replaced izip -> zip
I changed map(ord, h.digest()) -> map(int, h.digest())
It works fine on Python 2; I have only just moved to Python 3.
I have been trying to fix this for two hours already, and none of the solutions here work for me; I am probably missing something. As far as I understand, somewhere I simply need to add .encode("utf-8"), but I have already tried putting it everywhere. I thought it must be either the salt or the x in h.update(x).
I get the "Unicode-objects must be encoded before hashing" error in these lines:
EDIT
I found the line where something happens if I encode, but it results in another error.
u = _pseudorandom(''.join(map(chr, u)).encode("utf-8"))
results in:
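For comparison, a minimal sketch of building the whole hash on Python 3 with the standard library's hashlib.pbkdf2_hmac instead of the hand-rolled pbkdf2_bin; make_hash_py3 and its default parameters are only illustrative stand-ins for the SALT_LENGTH, COST_FACTOR and KEY_LENGTH settings above:
import hashlib
from base64 import b64encode
from os import urandom

def make_hash_py3(password, salt_length=16, iterations=1000, keylen=24):
    # everything stays bytes, so nothing has to be encoded mid-loop
    if isinstance(password, str):
        password = password.encode('utf-8')
    salt = b64encode(urandom(salt_length))
    digest = hashlib.pbkdf2_hmac('sha1', password, salt, iterations, keylen)
    return 'PBKDF2$sha1${}${}${}'.format(
        iterations, salt.decode('ascii'), b64encode(digest).decode('ascii'))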

Can't merge two lists into a dictionary

I can't merge two lists into a dictionary. I tried the following:
Map two lists into a dictionary in Python
I tried all of the solutions and I still get an empty dictionary.
from sklearn.feature_extraction import DictVectorizer
from itertools import izip
import itertools
text_file = open("/home/vesko_/evnt_classification/bag_of_words", "r")
text_fiel2 = open("/home/vesko_/evnt_classification/sdas", "r")
lines = text_file.read().split('\n')
words = text_fiel2.read().split('\n')
diction = dict(itertools.izip(words,lines))
new_dict = {k: v for k, v in zip(words, lines)}
print new_dict
I get the following :
{'word': ''}
['word=']
The two lists are not empty.
I'm using python2.7
EDIT :
Output from the two lists (I'm only showing a few because it's a vector with 11k features)
//lines
['change', 'I/O', 'fcnet2', 'ifconfig',....
//words
['word', 'word', 'word', .....
EDIT :
Now at least I have some output @DamianLattenero
{'word\n': 'XXAMSDB35:XXAMSDB35_NGCEAC_DAT_L_Drivei\n'}
['word\n=XXAMSDB35:XXAMSDB35_NGCEAC_DAT_L_Drivei\n']
I think the root of a lot of confusion is code in the example that is not relevant.
Try this:
text_file = open("/home/vesko_/evnt_classification/bag_of_words", "r")
text_fiel2 = open("/home/vesko_/evnt_classification/sdas", "r")
lines = text_file.read().split('\n')
words = text_fiel2.read().split('\n')

# to remove any extra newline or whitespace from what was read in
lines = [line.rstrip() for line in lines]
words = [word.rstrip() for word in words]

new_dict = dict(zip(words, lines))
print new_dict
Python's builtin zip() returns an iterable of tuples built from its arguments. Giving this iterable of tuples to the dict() constructor creates a dictionary where each item of words is a key and the corresponding item of lines is its value.
Also note that zip() stops at the end of the shorter sequence, so if the words file has more items than lines (or vice versa), the extra items are silently dropped.
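A tiny illustration of that truncation behaviour (Python 2 syntax, to match the question):
pairs = zip(['a', 'b', 'c'], [1, 2])
print pairs            # [('a', 1), ('b', 2)] -- the extra 'c' is dropped
print dict(pairs)      # maps 'a' to 1 and 'b' to 2; there is no entry for 'c'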
I tried this and it worked for me: I created two files, added the numbers 1 to 4 and the letters a to d, and the code creates the dictionary fine. I didn't need to import itertools; in fact there is an extra line that is not needed:
lines = [1,2,3,4]
words = ["a","b","c","d"]
diction = dict(zip(words,lines))
# new_dict = {k: v for k, v in zip(words, lines)}
print(diction)
{'a': 1, 'b': 2, 'c': 3, 'd': 4}
If that works and the other code does not, you must have a problem loading the lists; try loading them like this:
def create_list_from_file(file):
    with open(file, "r") as ins:
        my_list = []
        for line in ins:
            my_list.append(line)
    return my_list

lines = create_list_from_file("/home/vesko_/evnt_classification/bag_of_words")
words = create_list_from_file("/home/vesko_/evnt_classification/sdas")
diction = dict(zip(words, lines))
# new_dict = {k: v for k, v in zip(words, lines)}
print(diction)
Observation:
If your files look like this:
1
2
3
4
and
a
b
c
d
the result will keep the newline at the end of each key and value, one entry per line:
{'a\n': '1\n', 'b\n': '2\n', 'c\n': '3\n', 'd': '4'}
But if your files look like:
1 2 3 4
and
a b c d
the result will be {'a b c d': '1 2 3 4'}, with only a single entry.
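If the trailing newlines are not wanted in the keys and values, a small illustrative variation of the loader above strips them while reading:
def create_list_from_file(file):
    # same as above, but drop the trailing newline from every line
    with open(file, "r") as ins:
        return [line.rstrip('\n') for line in ins]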

Python - Convert a dictionary (having lists as values) into a CSV file

Trying to write the dictionary below into a CSV file, with the desired output as shown.
dict_data = {"1":["xyz"],
"2":["abc","def"],
"3":["zzz"]
}
desired output:
1,3,2
xyz,zzz,abc
def
The code below doesn't work as expected, as it writes both "abc" and "def" on the same row, as shown below.
with open('k.csv', 'wb') as out_file:
    writer = csv.writer(out_file, dialect='excel')
    headers = [k for k in dict_data]
    items = [dict_data[k] for k in dict_data]
    writer.writerow(headers)
    writer.writerow(items)
output:
1,3,2
xyz,zzz,abc,def
Here is the complete solution:
import csv
import os
class CsvfileWriter:
    '''
    Takes a dictionary as input and writes its items into a CSV file.
    For ex:-
    Input dictionary:
    dict_data = {"1":["xyz"],"2":["abc","def"],"3":["zzz"]}
    Output: (CSV file)
    1,3,2
    xyz,zzz,abc
    ,,def
    '''
    def __init__(self, dictInput, maxLength=0):
        '''
        Creates an instance with the following variables:
        dictInput & maxLength
        dictInput -> dictionary having values (lists) of the same length
        ex:-
        dict_data = {"1":["xyz",""],"2":["abc","def"],"3":["zzz",""]}
        maxLength -> length of the lists
        '''
        self.dictInput = dictInput
        self.maxLength = maxLength

    @classmethod
    def list_padding(cls, dictInput):
        '''
        Converts an input dictionary having lists (as values) of varying lengths into lists of constant length.
        Also returns the class variables dictInput & maxLength.
        Note:
        dictInput represents the dictionary after padding is applied.
        maxLength represents the length of the list (values in the dictionary) having the maximum number of items.
        Ex:-
        input dictionary:
        dict_data = {"1":["xyz"],"2":["abc","def"],"3":["zzz"]}
        output dictionary:
        dict_data = {"1":["xyz",""],"2":["abc","def"],"3":["zzz",""]}
        '''
        cls.dictInput = dictInput
        listValues = dictInput.values()
        listValues.sort(key=lambda i: len(i))
        maxLength = len(listValues[-1])
        for i in listValues:
            while len(i) < maxLength:
                i.append('')
        return cls(dictInput, maxLength)

    def write_to_csv(self):
        with open('sample_file.csv', 'wb') as out_file:
            writer = csv.writer(out_file, dialect='excel')
            headers = [k for k in self.dictInput]
            items = [self.dictInput[k] for k in self.dictInput]
            writer.writerow(headers)
            c = 0
            while c < self.maxLength:
                writer.writerow([i[c] for i in items])
                c += 1

dict_data = {"1":["xyz"],"2":["abc","def"],"3":["zzz"]}
cf = CsvfileWriter.list_padding(dict_data)
cf.write_to_csv()
The following works in Python 2:
import csv

dict_data = {
    "1": ["xyz"],
    "2": ["abc", "def"],
    "3": ["zzz"]
}

def transpose(cols):
    return map(lambda *row: list(row), *cols)

with open('k.csv', 'w') as out_file:
    writer = csv.writer(out_file, dialect='excel')
    headers = dict_data.keys()
    items = transpose(dict_data.values())
    writer.writerow(headers)
    writer.writerows(items)
I can't take credit for the transpose function, which I picked up from here. It turns a list of columns into a list of rows, automatically padding columns that are too short with None. Fortunately, the csv writer outputs blanks for None values, which is exactly what's needed.
(In Python 3, map behaves differently (no padding), so it would require some changes.)
Edit: A replacement transpose function that works for both Python 2 and 3 is:
def transpose(cols):
    def mypop(l):
        try:
            return l.pop(0)
        except IndexError:
            return ''
    while any(cols):
        yield [mypop(l) for l in cols]
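For what it's worth, a similar padding transpose can also be built on the standard library's zip_longest (izip_longest on Python 2); a minimal sketch, assuming empty strings are the desired fill value:
try:
    from itertools import zip_longest                    # Python 3
except ImportError:
    from itertools import izip_longest as zip_longest   # Python 2

def transpose(cols):
    # pads the shorter columns with '' instead of None
    return [list(row) for row in zip_longest(*cols, fillvalue='')]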

How to sort Python lists according to certain criteria

I would like to sort a list or an array using Python to achieve the following.
Say my initial list is:
example_list = ["retg_1_gertg","fsvs_1_vs","vrtv_2_srtv","srtv_2_bzt","wft_3_btb","tvsrt_3_rtbbrz"]
I would like to get all the elements that have a 1 after the first underscore together in one list, the ones that have a 2 together in another list, and so on. So the result should be:
sorted_list = [["retg_1_gertg","fsvs_1_vs"],["vrtv_2_srtv","srtv_2_bzt"],["wft_3_btb","tvsrt_3_rtbbrz"]]
My code:
import numpy as np
import string
example_list = ["retg_1_gertg","fsvs_1_vs","vrtv_2_srtv","srtv_2_bzt","wft_3_btb","tvsrt_3_rtbbrz"]
def sort_list(imagelist):
    # get number of wafers
    waferlist = []
    for image in imagelist:
        wafer_id = string.split(image, "_")[1]
        waferlist.append(wafer_id)
    waferlist = set(waferlist)
    waferlist = list(waferlist)
    number_of_wafers = len(waferlist)
    # create list
    sorted_list = []
    for i in range(number_of_wafers):
        sorted_list.append([])
    for i in range(number_of_wafers):
        wafer_id = waferlist[i]
        for image in imagelist:
            if string.split(image, "_")[1] == wafer_id:
                sorted_list[i].append(image)
    return sorted_list
sorted_list = sort_list(example_list)
This works, but it is really awkward and involves many for loops that slow everything down when the lists are large.
Is there a more elegant way, using numpy or anything else?
Help is appreciated. Thanks.
I'm not sure how much more elegant this solution is, but it is a bit more efficient. You could first sort the list and then go through it and split it into the final set of sorted lists:
example_list = ["retg_1_gertg","fsvs_1_vs","vrtv_2_srtv","srtv_2_bzt","wft_3_btb","tvsrt_3_rtbbrz"]

sorted_list = sorted(example_list, key=lambda x: x[x.index('_')+1])

result = [[]]
current_num = sorted_list[0][sorted_list[0].index('_')+1]
index = 0
for i in sorted_list:
    if current_num != i[i.index('_')+1]:
        current_num = i[i.index('_')+1]
        index += 1
        result.append([])
    result[index].append(i)
print result
If you can make assumptions about the values after the first underscore character, you could clean it up a bit (for example, if you knew that they would always be sequential numbers starting at 1).
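As a further sketch, assuming the grouping key is always the part between the first and second underscore, the whole thing can also be done in a single pass with collections.defaultdict:
from collections import defaultdict

example_list = ["retg_1_gertg","fsvs_1_vs","vrtv_2_srtv","srtv_2_bzt","wft_3_btb","tvsrt_3_rtbbrz"]

def sort_list(imagelist):
    groups = defaultdict(list)
    for image in imagelist:
        groups[image.split("_")[1]].append(image)   # key is "1", "2", "3", ...
    # sort the groups by key so the result is ordered like the example
    return [groups[key] for key in sorted(groups)]

print sort_list(example_list)
# [['retg_1_gertg', 'fsvs_1_vs'], ['vrtv_2_srtv', 'srtv_2_bzt'], ['wft_3_btb', 'tvsrt_3_rtbbrz']]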