ValueError on tensorflow while_loop shape invariants - python-2.7

import tensorflow as tf
cluster_size = tf.constant(6) # size of the cluster
m = tf.constant(6) # number of contigs (column size)
n = tf.constant(3) # number of points in a single contigs (column size)
contigs_index = tf.reshape(tf.range(0, m, 1, dtype=tf.int32), [1, -1])
contigs = tf.constant(
[[1.1, 2.2, 3.3], [6.6, 5.5, 4.4], [7.7, 8.8, 9.9], [11.1, 22.2, 33.3],
[66.6, 55.5, 44.4], [77.7, 88.8, 99.9]])
# pad zeo to the right till fixed length
def rpad_with_zero(points):
points = tf.slice(tf.pad(points, tf.reshape(tf.concat(
[tf.zeros([1, 2], tf.int32), tf.add(
tf.zeros([1, 2], tf.int32),
tf.subtract(cluster_size, tf.size(points)))], 0), [2, -1]), "CONSTANT"),
(0, tf.subtract(cluster_size, tf.size(points))),
(1, cluster_size))
return points
#calculate pearson correlation coefficient r value
def calculate_pcc(row, contigs):
r = tf.divide(tf.subtract(
tf.multiply(tf.to_float(n), tf.reduce_sum(tf.multiply(row, contigs), 1)),
tf.multiply(tf.reduce_sum(row, 1), tf.reduce_sum(contigs, 1))),
tf.multiply(
tf.sqrt(tf.subtract(
tf.multiply(tf.to_float(n), tf.reduce_sum(tf.square(row), 1)),
tf.square(tf.reduce_sum(row, 1)))),
tf.sqrt(tf.subtract(tf.multiply(
tf.to_float(n), tf.reduce_sum(tf.square(contigs), 1)),
tf.square(tf.reduce_sum(contigs, 1)))
)))
return r
#slice first row from contigs
row = tf.slice(contigs, (0, 0), (1, 3))
#calculate pcc
r = calculate_pcc(row, contigs)
#cluster member index whose r value is greater than 0.90, then casting to
# int32,
members0_index = tf.cast(tf.reshape(tf.where(tf.greater(r, 0.90)), [1, -1]),
tf.int32)
#members = index <intersection> members, padding the members index with
# zeros at right, to keep the fixed cluster length
members0_index = rpad_with_zero(
tf.reshape(tf.sets.set_intersection(contigs_index, members0_index).values,
[1, -1]))
#update index with the rest element index from contigs, and padding
contigs_index = rpad_with_zero(
tf.reshape(tf.sets.set_difference(contigs_index, members0_index).values,
[1, -1]))
#def condition(contigs, contigs_index, members0_index):
def condition(contigs_index, members0_index):
return tf.greater(tf.count_nonzero(contigs_index),
0) # iterate until there is a contig
#def body(contigs, contigs_index, members0_index):
def body(contigs_index, members0_index):
i = tf.reshape(tf.slice(contigs_index, [0, 0], [1, 1]),
[]) #the first element in the contigs_index
row = tf.slice(contigs, (i, 0),
(1, 3)) #slice the ith contig from contigs
r = calculate_pcc(row, contigs)
members_index = tf.cast(tf.reshape(tf.where(tf.greater(r, 0.90)), [1, -1]),
tf.int32)
members_index = rpad_with_zero(rpad_with_zero(
tf.reshape(tf.sets.set_intersection(contigs_index, members_index).values,
[1, -1])))
members0_index = tf.concat([members0_index, members_index], 0)
contigs_index = rpad_with_zero(
tf.reshape(tf.sets.set_difference(contigs_index, members_index).values,
[1, -1]))
#return [contigs, contigs_index, members0_index]
return [contigs_index, members0_index]
sess = tf.Session()
sess.run(tf.while_loop(condition, body,
#loop_vars=[contigs, contigs_index, members0_index],
loop_vars=[contigs_index, members0_index],
#shape_invariants=[contigs.get_shape(), contigs_index.get_shape(),
# tf.TensorShape([None, 6])]))
shape_invariants=[contigs_index.get_shape(), tf.TensorShape([None, 6])]))
The error is:
ValueError: The shape for while_12/Merge:0 is not an invariant for the
loop. It enters the loop with shape (1, 6), but has shape (?, ?) after
one iteration. Provide shape invariants using either the
shape_invariants argument of tf.while_loop or set_shape() on the
loop variables.
It seems the variable
contigs_index
is responsible, but i really don't know why! I unfold the loop execute each statement but could not find any shape mismatch!

shape_invariants=[contigs_index.get_shape(), tf.TensorShape([None, 6])])) should become shape_invariants=[tf.TensorShape([None, None]), tf.TensorShape([None, 6])])), to allow for shape changes of contigs_index variable (in the rpad_with_zero call).

Related

Python - Error using linprog ("Invalid input for linprog: provide a 3 x 2 array for bounds, not a 2 x 3 array")

I am trying to use the linprog in python to solve this problem:
# Minimize = (0.035*x1) + (0.015*x2) + (0.025*x3)
# x1+x2+x3=1.2
# 0<=x1<=0.7
# 0<=x2<=0.3
# 0<=x3<=0.5
c = [0.035, 0.015, 0.025] #objective function
A_eq = [[1, 1, 1]]
b = [1.2]
lb = (0, 0, 0)
up = (0.7, 0.3, 0.5)
from scipy.optimize import linprog
linprog(c, A_ub=None, b_ub=None, A_eq=A_eq, b_eq=b, bounds=[lb,up], method='interior-point', callback=None, options=None, x0=None)
However I am getting an error could you help me with that?
thanks a lot!
You sholud define correctly the bounds for each variable in the same order as the coefficients. In this case, they’re between zero and some number:
# Minimize = (0.035*x1) + (0.015*x2) + (0.025*x3)
# x1+x2+x3=1.2
# 0<=x1<=0.7
# 0<=x2<=0.3
# 0<=x3<=0.5
c = [0.035, 0.015, 0.025] #objective function
A_eq = [[1, 1, 1]]
b = [1.2]
x1_b = (0, 0.7)
x2_b = (0, 0.3)
x3_b = (0, 0.5)
from scipy.optimize import linprog
linprog(c, A_ub=None, b_ub=None, A_eq=A_eq, b_eq=b, bounds=[x1_b, x2_b,x3_b], method='interior-point', callback=None, options=None, x0=None)

Calculate two dimensional pairwise distance on a large numpy three dimensional array

I have a numpy array of 3 million points in the form of [pt_id, x, y, z]. The goal is to return all pairs of points that have an Euclidean distance two numbers min_d and max_d.
The Euclidean distance is between x and y and not on the z. However, I'd like to preserve the array with pt_id_from, pt_id_to, distance attributes.
I'm using scipy's dist to calculate the distances:
import scipy.spatial.distance
coords_arr = np.array([['pt1', 2452130.000, 7278106.000, 25.000],
['pt2', 2479539.000, 7287455.000, 4.900],
['pt3', 2479626.000, 7287458.000, 10.000],
['pt4', 2484097.000, 7292784.000, 8.800],
['pt5', 2484106.000, 7293079.000, 7.300],
['pt6', 2484095.000, 7292891.000, 11.100]])
dists = scipy.spatial.distance.pdist(coords_arr[:,1:3], 'euclidean')
np.savetxt('test.out', scipy.spatial.distance.squareform(dists), delimiter=',')
What should I do to return an array of form: [pt_id_from, pt_id_to, distance]?
You simply create a new array from the data by looping through all the possible combinations. The itertools module is excellent for this.
n = coords_arr.shape[0] # number of points
D = scipy.spatial.distance.squareform(dists) # distance matrix
data = []
for i, j in itertools.combinations(range(n), 2):
pt_a = coords_arr[i, 0]
pt_b = coords_arr[j, 0]
d_ab = D[i,j]
data.append([pt_a, pt_b, d_ab])
result_arr = np.array(data)
If memory is a problem, you might want to change the distance lookup from using the huge matrix D to looking up the value directly in dists using the i and j index.
Well, ['pt1', 'pt2', distance_as_number] is not exactly possible. The closest you can get with mixed datatypes is a structured array but then you can't do things like result[:2,0]. You'll have to index field names and array indices separately like: result[['a','b']][0].
Here is my solution:
import numpy as np
import scipy.spatial.distance
coords_arr = np.array([['pt1', 2452130.000, 7278106.000, 25.000],
['pt2', 2479539.000, 7287455.000, 4.900],
['pt3', 2479626.000, 7287458.000, 10.000],
['pt4', 2484097.000, 7292784.000, 8.800],
['pt5', 2484106.000, 7293079.000, 7.300],
['pt6', 2484095.000, 7292891.000, 11.100]])
dists = scipy.spatial.distance.pdist(coords_arr[:,1:3], 'euclidean')
# Create a shortcut for `coords_arr.shape[0]` which is basically
# the total amount of points, hence `n`
n = coords_arr.shape[0]
# `a` and `b` contain the indices of the points which were used to compute the
# distances in dists. In this example:
# a = [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4]
# b = [1, 2, 3, 4, 5, 2, 3, 4, 5, 3, 4, 5, 4, 5, 5]
a = np.arange(n).repeat(np.arange(n-1, -1, -1))
b = np.hstack([range(x, n) for x in xrange(1, n)])
min_d = 1000
max_d = 10000
# Find out which distances are in range.
in_range = np.less_equal(min_d, dists) & np.less_equal(dists, max_d)
# Define the datatype of the structured array which will be the result.
dtype = [('a', '<f8', (3,)), ('b', '<f8', (3,)), ('dist', '<f8')]
# Create an empty array. We fill it later because it makes the code cleaner.
# Its size is given by the sum over `in_range` which is possible
# since True and False are equivalent to 1 and 0.
result = np.empty(np.sum(in_range), dtype=dtype)
# Fill the resulting array.
result['a'] = coords_arr[a[in_range], 1:4]
result['b'] = coords_arr[b[in_range], 1:4]
result['dist'] = dists[in_range]
print(result)
# In caste you don't want a structured array at all, this is what you can do:
result = np.hstack([coords_arr[a[in_range],1:],
coords_arr[b[in_range],1:],
dists[in_range, None]]).astype('<f8')
print(result)
The structured array:
[([2479539.0, 7287455.0, 4.9], [2484097.0, 7292784.0, 8.8], 7012.389393067102)
([2479539.0, 7287455.0, 4.9], [2484106.0, 7293079.0, 7.3], 7244.7819152821985)
([2479539.0, 7287455.0, 4.9], [2484095.0, 7292891.0, 11.1], 7092.75912462844)
([2479626.0, 7287458.0, 10.0], [2484097.0, 7292784.0, 8.8], 6953.856268287403)
([2479626.0, 7287458.0, 10.0], [2484106.0, 7293079.0, 7.3], 7187.909362255481)
([2479626.0, 7287458.0, 10.0], [2484095.0, 7292891.0, 11.1], 7034.873843929257)]
The ndarray:
[[2479539.0, 7287455.0, 4.9, 2484097.0, 7292784.0, 8.8, 7012.3893],
[2479539.0, 7287455.0, 4.9, 2484106.0, 7293079.0, 7.3, 7244.7819],
[2479539.0, 7287455.0, 4.9, 2484095.0, 7292891.0, 11.1, 7092.7591],
[2479626.0, 7287458.0, 10.0, 2484097.0, 7292784.0, 8.8, 6953.8562],
[2479626.0, 7287458.0, 10.0, 2484106.0, 7293079.0, 7.3, 7187.9093],
[2479626.0, 7287458.0, 10.0, 2484095.0, 7292891.0, 11.1, 7034.8738]]
You can use np.where to get a coords of distances within a range, then generate a new list in your format, filtering same pairs. Like this:
>>> import scipy.spatial.distance
>>> import numpy as np
>>> coords_arr = np.array([['pt1', 2452130.000, 7278106.000, 25.000],
... ['pt2', 2479539.000, 7287455.000, 4.900],
... ['pt3', 2479626.000, 7287458.000, 10.000],
... ['pt4', 2484097.000, 7292784.000, 8.800],
... ['pt5', 2484106.000, 7293079.000, 7.300],
... ['pt6', 2484095.000, 7292891.000, 11.100]])
>>>
>>> dists = scipy.spatial.distance.pdist(coords_arr[:,1:3], 'euclidean')
>>> dists = scipy.spatial.distance.squareform(dists)
>>> x, y = np.where((dists >= 8000) & (dists <= 30000))
>>> [(coords_arr[x[i]][0], coords_arr[y[i]][0], dists[y[i]][x[i]]) for i in xrange(len(x)) if x[i] < y[i]]
[('pt1', 'pt2', 28959.576688895162), ('pt1', 'pt3', 29042.897927032005)]

How to count the number of zeros in Python?

My code is currently written as:
convert = {0:0,1:1,2:2,3:3,4:0,5:1,6:2,7:1}
rows = [[convert[random.randint(0,7)] for _ in range(5)] for _ in range(5)]
numgood = 25 - rows.count(0)
print numgood
>> 25
It always comes out as 25, so it's not just that rows contains no 0's.
Have you printed rows?
It's [[0, 1, 0, 0, 2], [1, 2, 0, 1, 2], [3, 1, 1, 1, 1], [1, 0, 0, 1, 0], [0, 3, 2, 0, 1]], so you have a nested list there.
If you want to count the number of 0's in those nested lists, you could try:
import random
convert = {0:0, 1:1, 2:2, 3:3, 4:0, 5:1, 6:2, 7:1}
rows = [[convert[random.randint(0, 7)] for _ in range(5)] for _ in range(5)]
numgood = 25 - sum(e.count(0) for e in rows)
print numgood
Output:
18
rows doesn't contain any zeroes; it contains lists, not integers.
>>> row = [1,2,3]
>>> type(row)
<type 'list'>
>>> row.count(2)
1
>>> rows = [[1,2,3],[4,5,6]]
>>> rows.count(2)
0
>>> rows.count([1,2,3])
1
To count the number of zeroes in any of the lists in rows, you could use a generator expression:
>>> rows = [[1,2,3],[4,5,6], [0,0,8]]
>>> sum(x == 0 for row in rows for x in row)
2
You could also use numpy:
import numpy as np
import random
convert = {0:0,1:1,2:2,3:3,4:0,5:1,6:2,7:1}
rows = [[convert[random.randint(0,7)] for _ in range(5)] for _ in range(5)]
numgood = 25 - np.count_nonzero(rows)
print numgood
Output:
9

Pandas Dataframe ValueError: Shape of passed values is (X, ), indices imply (X, Y)

I am getting an error and I'm not sure how to fix it.
The following seems to work:
def random(row):
return [1,2,3,4]
df = pandas.DataFrame(np.random.randn(5, 4), columns=list('ABCD'))
df.apply(func = random, axis = 1)
and my output is:
[1,2,3,4]
[1,2,3,4]
[1,2,3,4]
[1,2,3,4]
However, when I change one of the of the columns to a value such as 1 or None:
def random(row):
return [1,2,3,4]
df = pandas.DataFrame(np.random.randn(5, 4), columns=list('ABCD'))
df['E'] = 1
df.apply(func = random, axis = 1)
I get the the error:
ValueError: Shape of passed values is (5,), indices imply (5, 5)
I've been wrestling with this for a few days now and nothing seems to work. What is interesting is that when I change
def random(row):
return [1,2,3,4]
to
def random(row):
print [1,2,3,4]
everything seems to work normally.
This question is a clearer way of asking this question, which I feel may have been confusing.
My goal is to compute a list for each row and then create a column out of that.
EDIT: I originally start with a dataframe that hase one column. I add 4 columns in 4 difference apply steps, and then when I try to add another column I get this error.
If your goal is add new column to DataFrame, just write your function as function returning scalar value (not list), something like this:
>>> def random(row):
... return row.mean()
and then use apply:
>>> df['new'] = df.apply(func = random, axis = 1)
>>> df
A B C D new
0 0.201143 -2.345828 -2.186106 -0.784721 -1.278878
1 -0.198460 0.544879 0.554407 -0.161357 0.184867
2 0.269807 1.132344 0.120303 -0.116843 0.351403
3 -1.131396 1.278477 1.567599 0.483912 0.549648
4 0.288147 0.382764 -0.840972 0.838950 0.167222
I don't know if it possible for your new column to contain lists, but it deinitely possible to contain tuples ((...) instead of [...]):
>>> def random(row):
... return (1,2,3,4,5)
...
>>> df['new'] = df.apply(func = random, axis = 1)
>>> df
A B C D new
0 0.201143 -2.345828 -2.186106 -0.784721 (1, 2, 3, 4, 5)
1 -0.198460 0.544879 0.554407 -0.161357 (1, 2, 3, 4, 5)
2 0.269807 1.132344 0.120303 -0.116843 (1, 2, 3, 4, 5)
3 -1.131396 1.278477 1.567599 0.483912 (1, 2, 3, 4, 5)
4 0.288147 0.382764 -0.840972 0.838950 (1, 2, 3, 4, 5)
I use the code below it is just fine
import numpy as np
df = pd.DataFrame(np.array(your_data), columns=columns)

How to return a list of a single tuple(words,value) from a list of tuples(word, values)?

I need help with a function that can return words that have 3 or more characters that are "evenly" spaced, that is the ord() value for consecutive letters left to right are even (same difference value). This is what I have so far... and the output is this:
test_list2 = ['i', 'made', 'an', 'ace', 'today', 'at', 'tennis...yeaah', 'booi', ':d']
for word in test_list2:
if len(word) >=3:
temp_list = []
for chr1 in word:
if word.index(chr1) != (len(word)-1):
chr2 = word.index(chr1)+1
num = ord(word[chr2]) - ord(chr1)
temp_list.append(num)
temp_tup = (word, temp_list)
final_list.append(temp_tup)
final_list = [('made', [-12, 3, 1]), ('ace', [2, 2]), ('today', [-5, -11, -3, 24]),
('tennis...yeaah', [-15, 9, 0, 0, 10, -69, 0, 0, 0, -20, 9, 0, 0]),
('booi', [13, 0, 0])]
But i need to return only the ones that are evenly spaced ('ace'). The output should be like this,
[('ace',2)]
Assuming that you do not need the final_list with non evenly spaced numbers, then you can keep track of the num to see if it stays the same throughout the word. If you find a different num stop and go to the next word. If num stays the same then add a (word, num) tuple to the final_list:
for word in test_list2:
if len(word) >=3:
all_nums_are_same = True
prev_num = None
for chr1 in word:
if word.index(chr1) != (len(word)-1):
chr2 = word.index(chr1)+1
num = ord(word[chr2]) - ord(chr1)
if not prev_num:
prev_num = num
elif prev_num != num:
# different number is found, we can
# stop and move on to next word
all_nums_are_same = False
break
if all_nums_are_same:
# only add tuple if all numbers we the same
temp_tup = (word, prev_num)
final_list.append(temp_tup)
This yields [('ace',2)] as a result.
I banged this out in Python 3.3, compiles and works on my machine :)
There's a bunch of extra debugging junk in there like print statements, if you want to test it with some more complicated data (example: long blocks of text) for bugs.
I made use of enumerate(), rather than your word.index, not sure which is more pythonic?
import sys
### Define variables
test_list = ['i', 'made', 'an', 'ace', 'today', 'at', 'tennis...yeaah', 'booi', ':d']
proc_check = [('made', [-12, 3, 1]),
('ace', [2, 2]),
('today', [-5, -11, -3, 24]),
('tennis...yeaah', [-15, 9, 0, 0, 10, -69, 0, 0, 0, -20, 9, 0, 0]),
('booi', [13, 0, 0])]
final_check = [('ace', [2,2])]
test_list2 = ['ace', 'ace', 'ace']
proc_check2 = [('ace', [2, 2]),
('poo', [3, 3]),
('ace', [2, 2])]
final_check2 = [('ace', [2,2]),('poo', [2,2]),('ace', [2,2])]
### Function definitions
def wordIsEven(word_list, proc_list_check):
final_list = []
procd_list = []
for word in word_list:
temp_list = []
if len(word) >= 3:
for chr1 in word:
if word.index(chr1) != (len(word)-1):
chr2 = word.index(chr1)+1
num = ord(word[chr2]) - ord(chr1)
temp_list.append(num)
temp_tup = (word, temp_list)
procd_list.append(temp_tup)
errors = False
for i, check in enumerate(procd_list):
if check != proc_list_check[i]:
errors = True
print("Word Eval Fail! " + str(check) + " != " + str(proc_list_check[i]))
if errors == True:
print("List compare failed!" )
else:
print("Lists compare equally!")
for tmp_tup in procd_list:
print("Examining Slice: "+str(tmp_tup[1]))
for i, num in enumerate(tmp_tup[1]):
if i + 1 < len(tmp_tup[1]):
num2 = tmp_tup[1][i+1]
if num == num2:
if num != 0:
print("Got one! " + str(tmp_tup))
final_list.append(tmp_tup)
return final_list
### Code execution
my_list = wordIsEven(test_list2, proc_check2)
my_check = final_check2
print("Printing Final list:")
for i, item in enumerate(my_list):
tempStr = str(item)
if item != my_check[i]:
tempStr += " doesn't match check data..." + str(my_check[i])
print(tempStr)
sys.exit()