I use this code for hashing and salt:
def make_hash(password):
"""Generate a random salt and return a new hash for the password."""
if isinstance(password, str):
password = password.encode('utf-8')
salt = b64encode(urandom(SALT_LENGTH))
print (salt, type(salt))
#print (salt.encode('utf-8'), type(salt.encode('utf-8')))
return 'PBKDF2${}${}${}${}'.format(
HASH_FUNCTION,
COST_FACTOR,
salt,
b64encode(pbkdf2_bin(password, salt, COST_FACTOR, KEY_LENGTH,
getattr(hashlib, HASH_FUNCTION))))
Here is the pbkdf2_bin:
def pbkdf2_bin(data, salt, iterations=1000, keylen=24, hashfunc=None):
"""Returns a binary digest for the PBKDF2 hash algorithm of `data`
with the given `salt`. It iterates `iterations` time and produces a
key of `keylen` bytes. By default SHA-1 is used as hash function,
a different hashlib `hashfunc` can be provided.
"""
hashfunc = hashfunc or hashlib.sha1
mac = hmac.new(data, None, hashfunc)
def _pseudorandom(x, mac=mac):
h = mac.copy()
h.update(x)
return map(int, h.digest())
buf = []
for block in range(1, -(-keylen // mac.digest_size) + 1):
rv = u = _pseudorandom(salt + _pack_int(block))
for i in range(iterations - 1):
u = _pseudorandom(''.join(map(chr, u)))
rv = starmap(xor, zip(rv, u))
buf.extend(rv)
return ''.join(map(chr, buf))[:keylen]
I already adjusted some things as:
I replaced unicode -> str
I replaced izip -> zip
I changed this map(ord, h.digest()) -> map(int, h.digest())
For python 2 it works fine. I just jumped into python 3.
I am trying to fix this for 2 hours already, all solutions here do not work for me, probably I am missing something. As far as I understand somewhere I need simply to add .encode("utf-8") But I tryed already to put this everywhere. I thought it must be either the salt or the x in h.update(x)
I get the Unicode Objects must be encoded before hashing in these lines:
EDIT
I found the line where something happens if I encode, but it results in an other error.
u = _pseudorandom(''.join(map(chr, u)).encode("utf-8"))
results in:
Related
I have a folder with hundreds of txt files I need to analyse for similarity. Below is an example of a script I use to run similarity analysis. In the end I get an array or a matrix I can plot etc.
I would like to see how many pairs there are with cos_similarity > 0.5 (or any other threshold I decide to use), removing cos_similarity == 1 when I compare the same files, of course.
Secondly, I need a list of these pairs based on file names.
So the output for the example below would look like:
1
and
["doc1", "doc4"]
Will really appreciate your help as I feel a bit lost not knowing which direction to go.
This is an example of my script to get the matrix:
doc1 = "Amazon's promise of next-day deliveries could be investigated amid customer complaints that it is failing to meet that pledge."
doc2 = "The BBC has been inundated with comments from Amazon Prime customers. Most reported problems with deliveries."
doc3 = "An Amazon spokesman told the BBC the ASA had confirmed to it there was no investigation at this time."
doc4 = "Amazon's promise of next-day deliveries could be investigated amid customer complaints..."
documents = [doc1, doc2, doc3, doc4]
# In my real script I iterate through a folder (path) with txt files like this:
#def read_text(path):
# documents = []
# for filename in glob.iglob(path+'*.txt'):
# _file = open(filename, 'r')
# text = _file.read()
# documents.append(text)
# return documents
import nltk, string, numpy
nltk.download('punkt') # first-time use only
stemmer = nltk.stem.porter.PorterStemmer()
def StemTokens(tokens):
return [stemmer.stem(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
def StemNormalize(text):
return StemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))
nltk.download('wordnet') # first-time use only
lemmer = nltk.stem.WordNetLemmatizer()
def LemTokens(tokens):
return [lemmer.lemmatize(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
def LemNormalize(text):
return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))
from sklearn.feature_extraction.text import CountVectorizer
LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')
LemVectorizer.fit_transform(documents)
tf_matrix = LemVectorizer.transform(documents).toarray()
from sklearn.feature_extraction.text import TfidfTransformer
tfidfTran = TfidfTransformer(norm="l2")
tfidfTran.fit(tf_matrix)
tfidf_matrix = tfidfTran.transform(tf_matrix)
cos_similarity_matrix = (tfidf_matrix * tfidf_matrix.T).toarray()
from sklearn.feature_extraction.text import TfidfVectorizer
TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
def cos_similarity(textlist):
tfidf = TfidfVec.fit_transform(textlist)
return (tfidf * tfidf.T).toarray()
cos_similarity(documents)
Out:
array([[ 1. , 0.1459739 , 0.03613371, 0.76357693],
[ 0.1459739 , 1. , 0.11459266, 0.19117117],
[ 0.03613371, 0.11459266, 1. , 0.04732164],
[ 0.76357693, 0.19117117, 0.04732164, 1. ]])
As I understood your question, you want to create a function that reads the output numpy array and a certain value (threshold) in order to return two things:
how many docs are bigger than or equal the given threshold
the names of these docs.
So, here I've made the following function which takes three arguments:
the output numpy array from cos_similarity() function.
list of document names.
a certain number (threshold).
And here it's:
def get_docs(arr, docs_names, threshold):
output_tuples = []
for row in range(len(arr)):
lst = [row+1+idx for idx, num in \
enumerate(arr[row, row+1:]) if num >= threshold]
for item in lst:
output_tuples.append( (docs_names[row], docs_names[item]) )
return len(output_tuples), output_tuples
Let's see it in action:
>>> docs_names = ["doc1", "doc2", "doc3", "doc4"]
>>> arr = cos_similarity(documents)
>>> arr
array([[ 1. , 0.1459739 , 0.03613371, 0.76357693],
[ 0.1459739 , 1. , 0.11459266, 0.19117117],
[ 0.03613371, 0.11459266, 1. , 0.04732164],
[ 0.76357693, 0.19117117, 0.04732164, 1. ]])
>>> threshold = 0.5
>>> get_docs(arr, docs_names, threshold)
(1, [('doc1', 'doc4')])
>>> get_docs(arr, docs_names, 1)
(0, [])
>>> get_docs(lst, docs_names, 0.13)
(3, [('doc1', 'doc2'), ('doc1', 'doc4'), ('doc2', 'doc4')])
Let's see how this function works:
first, I iterate over every row of the numpy array.
Second, I iterate over every item in the row whose index is bigger than the row's index. So, we are iterating in a traingular shape like so:
and that's because each pair of documents is mentioned twice in the whole array. We can see that the two values arr[0][1] and arr[1][0] are the same. You also should notice that the diagonal items arn't included because we knew for sure that they are 1 as evey document is very similar to itself :).
Finally, we get the items whose values are bigger than or equal the given threshold, and return their indices. These indices are used later to get the documents names.
I have a list of a list of tuples. With unicode problems.
I have be struggling to encode this into equivalent characters and I have been unsuccessful.
Here is a sample of my code:
import spaghetti as sgt
import codecs
f = codecs.open('output-data-pos', encoding='utf-8')
raw = f.read()
reviews = [raw.split()]
output_tagged = (sgt.pos_tag_sents(reviews))
Here is a sample of output_tagged produces.
[[(u'cerramos', None), (u'igual', u'aq0cs0'), (u'arrancado', None), (u'estanter\xeda', None), (u'\xe9xito', u'ncms000'), (u'an\xe9cdotas', u'ncfp000')]]
My overall objective is to extract each value from the tuple and encode it in utf-8 for a final result such as
cerramos None
igual aq0cs0
arrancado None
estantería None
éxito ncms000
anécdotas ncfp000
Some of the strategies that I have so far tried are from simple stratgies:
where i try to output the list and encode it directly
d = codecs.open('output-data-tagged', 'w', encoding='utf-8')
d.write(output_tagged)
or this approach
f = open('output-data-tagged', 'w')
for output in output_tagged:
output.encode('utf-8')
f.write(output)
f.close
where I first try to map the list and then encode it:
list_of_lists = map(list, output_tagged)
print list_of_lists
where I try functions to encode the data
def reprunicode(u):
return reprunicode(u).decode('raw_unicode_escape')
print u'[%s]' % u', '.join([u'(%s,)' % reprunicode(ti[0]) for ti in output_tagged])
this one too:
def utf8data(list):
return [item.decode('utf8') for item in list]
print utf8data(output_tagged)
Considering my many trials, how can I extract the elements from the tuple in the list of list in order to arrive at my desired final encoding results?
So I have this class:
#!/usr/bin/python3
class MyClass(object):
def __init__(self, length):
self._list = length
def get(self, index):
try:
return self._list[index]
except IndexError:
return None
which takes in a list and returns a value, a list index I think. I am trying to get that value:
def my_function(a_list):
a_list = MyClass
for x in (10**p for p in range(1, 9)):
if a_list:
print(a_list)
def main():
length = my_function(MyClass([i for i in range(0, 543)]))
but I keep getting only the memory location of the list, I think this is supposed to return an int.
I am hoping this is a workable bit of code, but I am struggling, with the concept of passing an "object" to a class, it doesn't make any sense to me.
Here is a test I am supposed to use:
def test_large_list():
s_list = My_Class([i for i in xrange(0, 100000)])
assert len(s_list._list) == list_length(s_list)
Ok, Here is my full function that works, it is done, how od I do this so that the first line takes an argument
#!/usr/bin/python3
#def list_length(single_method_list): This is what I am supposed to work with
from single_method_list import SingleMethodList
def my_function(): # This is how I have done it and it works.
a_list = MyClass([i for i in range(0, 234589)])
for x in (10**p for p in range(1, 8)):
if a_list.get(x):
print("More than", x)
first = x
else:
print("Less than", x)
last = x
break
answer = False
while not answer:
result = (first + last)/2
result = int(round(result))
print(result)
if s_list.get(result):
first = result
print('first', result)
else:
last = result
print('last', result)
if s_list.get(result) and not s_list.get(result + 1):
answer = True
print(result + 1)
my_function()
I don't know what more I can give to explain where I am stuck, it is the OOP part of this that I don't know I need the same results here, just passing it to the function instead of creating it inside the function which I did in order to do the algorithm.
Well your class does something else.MyClass is designed to take a List at initialization, so the naming length is not a good idea.
The get() method of this class takes in a number and returns the element located at that particular index in the initialized self._list.
Your logic should be like:
def my_function(a_list):
a_list = MyClass(a_list)
...
def main():
length = my_function([i for i in range(0, 543)])
Just to clarify some misunderstanding that you might have.
Class does not return anything. It is a blueprint for creating objects.
What can return value is a method (function). For instance, if you want to write a method which returns length of some list:
def my_function(some_list):
return len(some_list)
Or in your case:
def my_function(a_list):
return len(a_list._list)
Note that you should not call your variables list. It's a built-in function in python which creates lists.
And as you can see there is another built-in function len in python which returns length of list, tuple, dictionary etc.
Hope this helps, although it's still a bit unclear what you're trying to achieve.
I'm pretty new to Python and Qgis, right now I'm just running scripts but I my end-goal is to create a plugin.
Here's the part of the code I'm having problems with:
import math
layer = qgis.utils.iface.activeLayer()
iter = layer.getFeatures()
dict = {}
#iterate over features
for feature in iter:
#print feature.id()
geom = feature.geometry()
coord = geom.asPolyline()
points=geom.asPolyline()
#get Endpoints
first = points[0]
last = points[-1]
#Assemble Features
dict[feature.id() ]= [first, last]
print dict
This is my result :
{0L: [(355277,6.68901e+06), (355385,6.68906e+06)], 1L: [(355238,6.68909e+06), (355340,6.68915e+06)], 2L: [(355340,6.68915e+06), (355452,6.68921e+06)], 3L: [(355340,6.68915e+06), (355364,6.6891e+06)], 4L: [(355364,6.6891e+06), (355385,6.68906e+06)], 5L: [(355261,6.68905e+06), (355364,6.6891e+06)], 6L: [(355364,6.6891e+06), (355481,6.68916e+06)], 7L: [(355385,6.68906e+06), (355501,6.68912e+06)]}
As you can see, many of the lines have a common endpoint:(355385,6.68906e+06) is shared by 7L, 4L and 0L for example.
I would like to create a new dictionary, fetching the shared points as a key, and having the second points as value.
eg : {(355385,6.68906e+06):[(355277,6.68901e+06), (355364,6.6891e+06), (355501,6.68912e+06)]}
I have been looking though list comprehension tutorials, but without much success: most people are looking to delete the duplicates, whereas I would like use them as keys (with unique IDs). Am I correct in thinking set() would still be useful?
I would be very grateful for any help, thanks in advance.
Maybe this is what you need?
dictionary = {}
for i in dict:
for j in dict:
c = set(dict[i]).intersection(set(dict[j]))
if len(c) == 1:
# ok, so now we know, that exactly one tuple exists in both
# sets at the same time, but this one will be the key to new dictionary
# we need the second tuple from the set to become value for this new key
# so we can subtract the key-tuple from set to get the other tuple
d = set(dict[i]).difference(c)
# Now we need to get tuple back from the set
# by doing list(c) we get list
# and our tuple is the first element in the list, thus list(c)[0]
c = list(c)[0]
dictionary[c] = list(d)[0]
else: pass
This code attaches only one tuple to the key in dictionary. If you want multiple values for each key, you can modify it so that each key would have a list of values, this can be done by simply modifying:
# some_value cannot be a set, it can be obtained with c = list(c)[0]
key = some_value
dictionary.setdefault(key, [])
dictionary[key].append(value)
So, the correct answer would be:
dictionary = {}
for i in a:
for j in a:
c = set(a[i]).intersection(set(a[j]))
if len(c) == 1:
d = set(a[i]).difference(c)
c = list(c)[0]
value = list(d)[0]
if c in dictionary and value not in dictionary[c]:
dictionary[c].append(value)
elif c not in dictionary:
dictionary.setdefault(c, [])
dictionary[c].append(value)
else: pass
See this code :
dict={0L: [(355277,6.68901e+06), (355385,6.68906e+06)], 1L: [(355238,6.68909e+06), (355340,6.68915e+06)], 2L: [(355340,6.68915e+06), (355452,6.68921e+06)], 3L: [(355340,6.68915e+06), (355364,6.6891e+06)], 4L: [(355364,6.6891e+06), (355385,6.68906e+06)], 5L: [(355261,6.68905e+06), (355364,6.6891e+06)], 6L: [(355364,6.6891e+06), (355481,6.68916e+06)], 7L: [(355385,6.68906e+06), (355501,6.68912e+06)]}
dictionary = {}
list=[]
for item in dict :
list.append(dict[0])
list.append(dict[1])
b = []
[b.append(x) for c in list for x in c if x not in b]
print b # or set(b)
res={}
for elm in b :
lst=[]
for item in dict :
if dict[item][0] == elm :
lst.append(dict[item][1])
elif dict[item][1] == elm :
lst.append(dict[item][0])
res[elm]=lst
print res
I'm newbie in Python and I'm struggling in create a list of sums generated by a for loop.
I got an school assignment where my program have to simulate the scores of a class of blind students in a multiple choice test.
def blindwalk(): # Generates the blind answers in a test with 21 questions
import random
resp = []
gab = ["a","b","c","d"]
for n in range(0,21):
resp.append(random.choice(gab))
return(resp)
def gabarite(): # Generates the official answer key of the tests
import random
answ_gab = []
gab = ["a","b","c","d"]
for n in range(0,21):
answ_gab.append(random.choice(gab))
return(answ_gab)
def class_tests(A): # A is the number of students
alumni = []
A = int(A)
for a in range(0,A):
alumni.append(blindwalk())
return alumni
def class_total(A): # A is the number of students
A = int(A)
official_gab = gabarite()
tests = class_tests(A)
total_score = []*0
for a in range(0,A):
for n in range(0,21):
if tests[a][n] == official_gab[n]:
total_score[a].add(1)
return total_score
When I run the class_total() function, I get this error:
total_score[a].add(1)
IndexError: list index out of range
Question is: How I valuate the scores of each student and create a list with them, because this is what I want to do with the class_total() function.
I also tried
if tests[a][n] == official_gab[n]:
total_score[a] += 1
But I got the same error, so I think I don't fully understand how lists work in Python yet.
Thanks!
(Also, I'm not a English native-speaker, so please tell me if I couldn't be clear enough)
This line:
total_score = []*0
And in fact, any of the following lines:
total_score = []*30
total_score = []*3000
total_score = []*300000000
Cause total_score to be instantiated as an empty list. It doesn't even have a 0th index, in this case! If you'd like to initiate every value to x in a list of length l , the syntax would look more like:
my_list = [x]*l
Alternatively, instead of thinking about the size before-hand, you can use .append instead of trying to access a particular index, as in:
my_list = []
my_list.append(200)
# my_list is now [200], my_list[0] is now 200
my_list.append(300)
# my_list is now [200,300], my_list[0] is still 200 and my_list[1] is now 300