Python 3 Unicode Objects must be encoded before hashing - python-2.7

I use this code for hashing with a salt:
def make_hash(password):
    """Generate a random salt and return a new hash for the password."""
    if isinstance(password, str):
        password = password.encode('utf-8')
    salt = b64encode(urandom(SALT_LENGTH))
    print(salt, type(salt))
    #print(salt.encode('utf-8'), type(salt.encode('utf-8')))
    return 'PBKDF2${}${}${}${}'.format(
        HASH_FUNCTION,
        COST_FACTOR,
        salt,
        b64encode(pbkdf2_bin(password, salt, COST_FACTOR, KEY_LENGTH,
                             getattr(hashlib, HASH_FUNCTION))))
Here is the pbkdf2_bin:
def pbkdf2_bin(data, salt, iterations=1000, keylen=24, hashfunc=None):
    """Returns a binary digest for the PBKDF2 hash algorithm of `data`
    with the given `salt`. It iterates `iterations` times and produces a
    key of `keylen` bytes. By default SHA-1 is used as hash function;
    a different hashlib `hashfunc` can be provided.
    """
    hashfunc = hashfunc or hashlib.sha1
    mac = hmac.new(data, None, hashfunc)

    def _pseudorandom(x, mac=mac):
        h = mac.copy()
        h.update(x)
        return map(int, h.digest())

    buf = []
    for block in range(1, -(-keylen // mac.digest_size) + 1):
        rv = u = _pseudorandom(salt + _pack_int(block))
        for i in range(iterations - 1):
            u = _pseudorandom(''.join(map(chr, u)))
            rv = starmap(xor, zip(rv, u))
        buf.extend(rv)
    return ''.join(map(chr, buf))[:keylen]
I already adjusted some things:
I replaced unicode -> str
I replaced izip -> zip
I changed map(ord, h.digest()) -> map(int, h.digest())
For Python 2 it works fine; I just jumped into Python 3.
I have been trying to fix this for two hours already; the solutions I found here do not work for me, so I am probably missing something. As far as I understand, I simply need to add .encode("utf-8") somewhere, but I have already tried putting it everywhere. I thought it must be either the salt or the x in h.update(x).
I get the "Unicode objects must be encoded before hashing" error in these lines:
EDIT
I found the line where something happens if I encode, but it results in another error.
u = _pseudorandom(''.join(map(chr, u)).encode("utf-8"))
results in:
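Whatever the follow-up exception is, the underlying problem is that the Python 2 version round-trips between text and bytes with chr() and ''.join(), which no longer works once hashlib demands bytes. Below is a minimal sketch of a bytes-only version for Python 3; it is my own adaptation rather than the original snippet, and it assumes that _pack_int packs the block counter as a big-endian unsigned 32-bit integer:
import hashlib
import hmac
import struct
from itertools import starmap
from operator import xor

def pbkdf2_bin(data, salt, iterations=1000, keylen=24, hashfunc=None):
    """PBKDF2 that stays in bytes end to end (Python 3 sketch)."""
    hashfunc = hashfunc or hashlib.sha1
    mac = hmac.new(data, None, hashfunc)       # data and salt must both be bytes

    def _pseudorandom(x, mac=mac):
        h = mac.copy()
        h.update(x)
        return h.digest()                      # bytes; iterating it yields ints in Python 3

    buf = bytearray()
    for block in range(1, -(-keylen // mac.digest_size) + 1):
        # assumption: this replicates what _pack_int does in the original module
        rv = u = _pseudorandom(salt + struct.pack('>I', block))
        for _ in range(iterations - 1):
            u = _pseudorandom(u)
            rv = bytes(starmap(xor, zip(rv, u)))   # XOR byte by byte, stay in bytes
        buf.extend(rv)
    return bytes(buf[:keylen])
On Python 3.4+ the standard library already provides hashlib.pbkdf2_hmac(HASH_FUNCTION, password, salt, COST_FACTOR, KEY_LENGTH), which does the same job without any hand-rolled loop.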

Related

Counting matrix pairs using a threshold

I have a folder with hundreds of txt files I need to analyse for similarity. Below is an example of a script I use to run similarity analysis. In the end I get an array or a matrix I can plot etc.
I would like to see how many pairs there are with cos_similarity > 0.5 (or any other threshold I decide to use), removing cos_similarity == 1 when I compare the same files, of course.
Secondly, I need a list of these pairs based on file names.
So the output for the example below would look like:
1
and
["doc1", "doc4"]
I will really appreciate your help, as I feel a bit lost not knowing which direction to go.
This is an example of my script to get the matrix:
doc1 = "Amazon's promise of next-day deliveries could be investigated amid customer complaints that it is failing to meet that pledge."
doc2 = "The BBC has been inundated with comments from Amazon Prime customers. Most reported problems with deliveries."
doc3 = "An Amazon spokesman told the BBC the ASA had confirmed to it there was no investigation at this time."
doc4 = "Amazon's promise of next-day deliveries could be investigated amid customer complaints..."
documents = [doc1, doc2, doc3, doc4]

# In my real script I iterate through a folder (path) with txt files like this:
#def read_text(path):
#    documents = []
#    for filename in glob.iglob(path+'*.txt'):
#        _file = open(filename, 'r')
#        text = _file.read()
#        documents.append(text)
#    return documents

import nltk, string, numpy
nltk.download('punkt')    # first-time use only
stemmer = nltk.stem.porter.PorterStemmer()
def StemTokens(tokens):
    return [stemmer.stem(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
def StemNormalize(text):
    return StemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

nltk.download('wordnet')  # first-time use only
lemmer = nltk.stem.WordNetLemmatizer()
def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

from sklearn.feature_extraction.text import CountVectorizer
LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')
LemVectorizer.fit_transform(documents)
tf_matrix = LemVectorizer.transform(documents).toarray()

from sklearn.feature_extraction.text import TfidfTransformer
tfidfTran = TfidfTransformer(norm="l2")
tfidfTran.fit(tf_matrix)
tfidf_matrix = tfidfTran.transform(tf_matrix)
cos_similarity_matrix = (tfidf_matrix * tfidf_matrix.T).toarray()

from sklearn.feature_extraction.text import TfidfVectorizer
TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
def cos_similarity(textlist):
    tfidf = TfidfVec.fit_transform(textlist)
    return (tfidf * tfidf.T).toarray()
cos_similarity(documents)
Out:
array([[ 1. , 0.1459739 , 0.03613371, 0.76357693],
[ 0.1459739 , 1. , 0.11459266, 0.19117117],
[ 0.03613371, 0.11459266, 1. , 0.04732164],
[ 0.76357693, 0.19117117, 0.04732164, 1. ]])
As I understood your question, you want to create a function that takes the output numpy array and a certain value (threshold) in order to return two things:
how many document pairs have a similarity greater than or equal to the given threshold
the names of those documents.
So, here I've made the following function, which takes three arguments:
the output numpy array from the cos_similarity() function.
a list of document names.
a certain number (threshold).
And here it is:
def get_docs(arr, docs_names, threshold):
    output_tuples = []
    for row in range(len(arr)):
        lst = [row + 1 + idx for idx, num in
               enumerate(arr[row, row+1:]) if num >= threshold]
        for item in lst:
            output_tuples.append((docs_names[row], docs_names[item]))
    return len(output_tuples), output_tuples
Let's see it in action:
>>> docs_names = ["doc1", "doc2", "doc3", "doc4"]
>>> arr = cos_similarity(documents)
>>> arr
array([[ 1. , 0.1459739 , 0.03613371, 0.76357693],
[ 0.1459739 , 1. , 0.11459266, 0.19117117],
[ 0.03613371, 0.11459266, 1. , 0.04732164],
[ 0.76357693, 0.19117117, 0.04732164, 1. ]])
>>> threshold = 0.5
>>> get_docs(arr, docs_names, threshold)
(1, [('doc1', 'doc4')])
>>> get_docs(arr, docs_names, 1)
(0, [])
>>> get_docs(arr, docs_names, 0.13)
(3, [('doc1', 'doc2'), ('doc1', 'doc4'), ('doc2', 'doc4')])
Let's see how this function works:
First, I iterate over every row of the numpy array.
Second, I iterate over every item in the row whose index is bigger than the row's index, so we only walk the upper triangle of the matrix,
and that's because each pair of documents is mentioned twice in the whole array; we can see that the two values arr[0][1] and arr[1][0] are the same. You should also notice that the diagonal items aren't included, because we know for sure that they are 1, as every document is perfectly similar to itself :).
Finally, we get the items whose values are greater than or equal to the given threshold, and return their indices. These indices are used later to get the document names.
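For comparison, the same upper-triangle filtering can be written directly with numpy indexing; this is only a sketch of an alternative, not part of the answer above:
import numpy as np

def get_docs_np(arr, docs_names, threshold):
    # Indices of the strict upper triangle (k=1 skips the diagonal of 1.0s).
    rows, cols = np.triu_indices(len(arr), k=1)
    keep = arr[rows, cols] >= threshold
    pairs = [(docs_names[r], docs_names[c]) for r, c in zip(rows[keep], cols[keep])]
    return len(pairs), pairs
With the array above, get_docs_np(arr, docs_names, 0.5) should give the same (1, [('doc1', 'doc4')]).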

Extract elements from tuple to encode in python

I have a list of lists of tuples, with Unicode problems.
I have been struggling to encode this into the equivalent characters and have been unsuccessful.
Here is a sample of my code:
import spaghetti as sgt
import codecs
f = codecs.open('output-data-pos', encoding='utf-8')
raw = f.read()
reviews = [raw.split()]
output_tagged = (sgt.pos_tag_sents(reviews))
Here is a sample of what output_tagged produces:
[[(u'cerramos', None), (u'igual', u'aq0cs0'), (u'arrancado', None), (u'estanter\xeda', None), (u'\xe9xito', u'ncms000'), (u'an\xe9cdotas', u'ncfp000')]]
My overall objective is to extract each value from the tuples and encode it in UTF-8, for a final result such as:
cerramos None
igual aq0cs0
arrancado None
estantería None
éxito ncms000
anécdotas ncfp000
Some of the strategies that I have tried so far, starting with the simple ones:
where I try to write out the list and encode it directly:
d = codecs.open('output-data-tagged', 'w', encoding='utf-8')
d.write(output_tagged)
or this approach
f = open('output-data-tagged', 'w')
for output in output_tagged:
    output.encode('utf-8')
    f.write(output)
f.close
where I first try to map the list and then encode it:
list_of_lists = map(list, output_tagged)
print list_of_lists
where I try functions to encode the data
def reprunicode(u):
    return reprunicode(u).decode('raw_unicode_escape')

print u'[%s]' % u', '.join([u'(%s,)' % reprunicode(ti[0]) for ti in output_tagged])
this one too:
def utf8data(list):
    return [item.decode('utf8') for item in list]

print utf8data(output_tagged)
Considering my many trials, how can I extract the elements from the tuples in the list of lists in order to arrive at my desired final encoding results?
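For what it's worth, here is a minimal sketch of one way to get the output shown above, assuming output_tagged really is a list of sentences where each sentence is a list of (word, tag) tuples; it is my own sketch, not taken from spaghetti's documentation:
# -*- coding: utf-8 -*-
import codecs

with codecs.open('output-data-tagged', 'w', encoding='utf-8') as out:
    for sentence in output_tagged:     # output_tagged is a list of sentences
        for word, tag in sentence:     # each sentence is a list of (word, tag) tuples
            # word is already a unicode string; %s renders a None tag as the text "None"
            out.write(u'%s %s\n' % (word, tag))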

object returning memory location instead of value

So I have this class:
#!/usr/bin/python3
class MyClass(object):
    def __init__(self, length):
        self._list = length

    def get(self, index):
        try:
            return self._list[index]
        except IndexError:
            return None
which takes in a list, and get() returns the value at a given index, I think. I am trying to get that value:
def my_function(a_list):
    a_list = MyClass
    for x in (10**p for p in range(1, 9)):
        if a_list:
            print(a_list)

def main():
    length = my_function(MyClass([i for i in range(0, 543)]))
but I keep getting only the memory location of the list; I think this is supposed to return an int.
I am hoping this is a workable bit of code, but I am struggling with the concept of passing an "object" to a class; it doesn't make any sense to me.
Here is a test I am supposed to use:
def test_large_list():
    s_list = My_Class([i for i in xrange(0, 100000)])
    assert len(s_list._list) == list_length(s_list)
OK, here is my full function that works; it is done. How do I do this so that the first line takes an argument?
#!/usr/bin/python3
#def list_length(single_method_list):  # This is what I am supposed to work with
from single_method_list import SingleMethodList

def my_function():  # This is how I have done it and it works.
    a_list = MyClass([i for i in range(0, 234589)])
    for x in (10**p for p in range(1, 8)):
        if a_list.get(x):
            print("More than", x)
            first = x
        else:
            print("Less than", x)
            last = x
            break
    answer = False
    while not answer:
        result = (first + last)/2
        result = int(round(result))
        print(result)
        if a_list.get(result):
            first = result
            print('first', result)
        else:
            last = result
            print('last', result)
        if a_list.get(result) and not a_list.get(result + 1):
            answer = True
            print(result + 1)

my_function()
I don't know what more I can give to explain where I am stuck; it is the OOP part of this that I don't understand. I need the same results here, just passing the object to the function instead of creating it inside the function, which I did in order to run the algorithm.
Well, your class does something else. MyClass is designed to take a list at initialization, so the parameter name length is not a good idea.
The get() method of this class takes in a number and returns the element located at that particular index in the initialized self._list.
Your logic should be like:
def my_function(a_list):
    a_list = MyClass(a_list)
    ...

def main():
    length = my_function([i for i in range(0, 543)])
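To make that concrete, here is a sketch of how the search could live in a function that receives the already-constructed object, using the list_length name from the test above; the exponential-search-plus-bisection approach mirrors the asker's algorithm and assumes the stored list never contains None:
def list_length(single_method_list):
    """Length of the wrapped list using only .get(): exponential search, then bisection."""
    if single_method_list.get(0) is None:                # empty list
        return 0
    low, high = 0, 1
    while single_method_list.get(high) is not None:      # find an index that is out of range
        low, high = high, high * 2
    while high - low > 1:                                # invariant: get(low) exists, get(high) does not
        mid = (low + high) // 2
        if single_method_list.get(mid) is not None:
            low = mid
        else:
            high = mid
    return low + 1                                       # low is the last valid index
With this, list_length(MyClass([i for i in range(0, 543)])) should return 543.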
Just to clarify some misunderstandings that you might have:
A class does not return anything. It is a blueprint for creating objects.
What can return a value is a method (function). For instance, if you want to write a function which returns the length of some list:
def my_function(some_list):
    return len(some_list)
Or in your case:
def my_function(a_list):
    return len(a_list._list)
Note that you should not call your variables list; it's a built-in function in Python which creates lists.
And as you can see there is another built-in function, len, which returns the length of a list, tuple, dictionary, etc.
Hope this helps, although it's still a bit unclear what you're trying to achieve.

How do I extract part of a tuple that's duplicate as key to a dictionary, and have the second part of the tuple as value?

I'm pretty new to Python and QGIS; right now I'm just running scripts, but my end goal is to create a plugin.
Here's the part of the code I'm having problems with:
import math
layer = qgis.utils.iface.activeLayer()
iter = layer.getFeatures()
dict = {}

# iterate over features
for feature in iter:
    #print feature.id()
    geom = feature.geometry()
    coord = geom.asPolyline()
    points = geom.asPolyline()
    # get endpoints
    first = points[0]
    last = points[-1]
    # assemble features
    dict[feature.id()] = [first, last]
print dict
This is my result:
{0L: [(355277,6.68901e+06), (355385,6.68906e+06)], 1L: [(355238,6.68909e+06), (355340,6.68915e+06)], 2L: [(355340,6.68915e+06), (355452,6.68921e+06)], 3L: [(355340,6.68915e+06), (355364,6.6891e+06)], 4L: [(355364,6.6891e+06), (355385,6.68906e+06)], 5L: [(355261,6.68905e+06), (355364,6.6891e+06)], 6L: [(355364,6.6891e+06), (355481,6.68916e+06)], 7L: [(355385,6.68906e+06), (355501,6.68912e+06)]}
As you can see, many of the lines have a common endpoint: (355385,6.68906e+06) is shared by 7L, 4L and 0L, for example.
I would like to create a new dictionary, using the shared points as keys and the other endpoints as values.
e.g.: {(355385,6.68906e+06):[(355277,6.68901e+06), (355364,6.6891e+06), (355501,6.68912e+06)]}
I have been looking through list comprehension tutorials, but without much success: most people are looking to delete the duplicates, whereas I would like to use them as keys (with unique IDs). Am I correct in thinking set() would still be useful?
I would be very grateful for any help, thanks in advance.
Maybe this is what you need?
dictionary = {}
for i in dict:
    for j in dict:
        c = set(dict[i]).intersection(set(dict[j]))
        if len(c) == 1:
            # ok, so now we know that exactly one tuple exists in both
            # sets at the same time, and this one will be the key to the new dictionary;
            # we need the second tuple from the set to become the value for this new key,
            # so we can subtract the key-tuple from the set to get the other tuple
            d = set(dict[i]).difference(c)
            # Now we need to get the tuple back from the set:
            # by doing list(c) we get a list,
            # and our tuple is the first element in the list, thus list(c)[0]
            c = list(c)[0]
            dictionary[c] = list(d)[0]
        else:
            pass
This code attaches only one tuple to each key in dictionary. If you want multiple values per key, you can modify it so that each key holds a list of values; this can be done by simply using:
# some_value cannot be a set, it can be obtained with c = list(c)[0]
key = some_value
dictionary.setdefault(key, [])
dictionary[key].append(value)
So, the correct answer would be:
dictionary = {}
for i in a:
    for j in a:
        c = set(a[i]).intersection(set(a[j]))
        if len(c) == 1:
            d = set(a[i]).difference(c)
            c = list(c)[0]
            value = list(d)[0]
            if c in dictionary and value not in dictionary[c]:
                dictionary[c].append(value)
            elif c not in dictionary:
                dictionary.setdefault(c, [])
                dictionary[c].append(value)
        else:
            pass
See this code:
dict = {0L: [(355277,6.68901e+06), (355385,6.68906e+06)], 1L: [(355238,6.68909e+06), (355340,6.68915e+06)], 2L: [(355340,6.68915e+06), (355452,6.68921e+06)], 3L: [(355340,6.68915e+06), (355364,6.6891e+06)], 4L: [(355364,6.6891e+06), (355385,6.68906e+06)], 5L: [(355261,6.68905e+06), (355364,6.6891e+06)], 6L: [(355364,6.6891e+06), (355481,6.68916e+06)], 7L: [(355385,6.68906e+06), (355501,6.68912e+06)]}
dictionary = {}
list = []
for item in dict:
    list.append(dict[item])   # each value is a [first, last] pair of endpoints
b = []
[b.append(x) for c in list for x in c if x not in b]   # flatten and deduplicate the endpoints
print b   # or set(b)
res = {}
for elm in b:
    lst = []
    for item in dict:
        if dict[item][0] == elm:
            lst.append(dict[item][1])
        elif dict[item][1] == elm:
            lst.append(dict[item][0])
    res[elm] = lst
print res
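As a further sketch (not part of either answer above), the same grouping can be built in one pass with collections.defaultdict, assuming the endpoints hash and compare the way the printed tuples suggest:
from collections import defaultdict

# Map every endpoint to the other endpoints it is paired with.
endpoints = defaultdict(list)
for first, last in dict.values():   # `dict` is the {feature id: [first, last]} mapping from the question
    endpoints[first].append(last)
    endpoints[last].append(first)

# Keep only the points that occur in more than one line.
shared = {point: others for point, others in endpoints.items() if len(others) > 1}
print(shared)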

Creating a list of sums

I'm a newbie in Python and I'm struggling to create a list of sums generated by a for loop.
I got a school assignment where my program has to simulate the scores of a class of blind students on a multiple-choice test.
def blindwalk():   # Generates the blind answers in a test with 21 questions
    import random
    resp = []
    gab = ["a","b","c","d"]
    for n in range(0,21):
        resp.append(random.choice(gab))
    return(resp)

def gabarite():    # Generates the official answer key of the tests
    import random
    answ_gab = []
    gab = ["a","b","c","d"]
    for n in range(0,21):
        answ_gab.append(random.choice(gab))
    return(answ_gab)

def class_tests(A):   # A is the number of students
    alumni = []
    A = int(A)
    for a in range(0,A):
        alumni.append(blindwalk())
    return alumni

def class_total(A):   # A is the number of students
    A = int(A)
    official_gab = gabarite()
    tests = class_tests(A)
    total_score = []*0
    for a in range(0,A):
        for n in range(0,21):
            if tests[a][n] == official_gab[n]:
                total_score[a].add(1)
    return total_score
When I run the class_total() function, I get this error:
total_score[a].add(1)
IndexError: list index out of range
The question is: how do I evaluate the score of each student and create a list of those scores? This is what I want to do with the class_total() function.
I also tried:
if tests[a][n] == official_gab[n]:
    total_score[a] += 1
But I got the same error, so I think I don't fully understand how lists work in Python yet.
Thanks!
(Also, I'm not a native English speaker, so please tell me if I'm not clear enough.)
This line:
total_score = []*0
And in fact, any of the following lines:
total_score = []*30
total_score = []*3000
total_score = []*300000000
cause total_score to be instantiated as an empty list. It doesn't even have a 0th index in this case! If you'd like to initialize every value to x in a list of length l, the syntax would look more like:
my_list = [x]*l
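For example, applied to the question's total_score (my own illustration, with A standing for the number of students):
A = 30
total_score = [0] * A    # one zeroed slot per student
total_score[3] += 1      # indexing an existing slot now works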
Alternatively, instead of thinking about the size beforehand, you can use .append instead of trying to access a particular index, as in:
my_list = []
my_list.append(200)
# my_list is now [200], my_list[0] is now 200
my_list.append(300)
# my_list is now [200,300], my_list[0] is still 200 and my_list[1] is now 300
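Putting that together, here is a sketch of how the question's class_total could count with .append; this is my adaptation of the question's own functions, not code from the answer:
def class_total(A):  # A is the number of students
    A = int(A)
    official_gab = gabarite()
    tests = class_tests(A)
    total_score = []
    for a in range(0, A):
        score = 0
        for n in range(0, 21):
            if tests[a][n] == official_gab[n]:
                score += 1
        total_score.append(score)   # one total per student
    return total_score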