comparing two lists of unequal length at each index - list

I have two lists of unequal length such as
list1 = ['G','T','C','A','G']
list2 = ['AAAAA','TTTT','GGGG','CCCCCCCC']
I want to compare these two lists at each index only against the corresponding positions i.e list2[0] against list1[0] and list2[1] against list1[1] and so on upto the length of list1.
And get two new lists one having the mismatches and the second having the position of mismatches for example in the language of coding it can be stated as :
if 'G' == 'GGG' or 'G' # where 'G' is from list1[1] and 'GGG' is from list2[2]
elif 'G' == 'AAA'
{
outlist1 == list1[index] # postion of mismatch
outlist2 == 'G/A'
}

ok this works. There are definitely ways to do it in less code, but I think this is pretty clear:
#Function to process the lists
def get_mismatches(list1,list2):
#Prepare the output lists
mismatch_list = []
mismatch_pos = []
#Figure out which list is smaller
smaller_list_len = min(len(list1),len(list2))
#Loop through the lists checking element by element
for ind in range(smaller_list_len):
elem1 = list1[ind][0] #First char of string 1, such as 'G'
elem2 = list2[ind][0] #First char of string 2, such as 'A'
#If they match just continue
if elem1 == elem2:
continue
#If they don't match update the output lists
else:
mismatch_pos.append(ind)
mismatch_list.append(elem1+'/'+elem2)
#Return the output lists
return mismatch_list,mismatch_pos
#Make input lists
list1 = ['G','T','C','A','G']
list2 = ['AAAAA','TTTT','GGGG','CCCCCCCC']
#Call the function to get the output lists
outlist1,outlist2 = get_mismatches(list1,list2)
#Print the output lists:
print outlist1
print outlist2
Output:
['G/A', 'C/G', 'A/C']
[0, 2, 3]
And just to see how short I could get the code I made this function which I think is equivalent:
def short_get_mismatches(l1,l2):
o1,o2 = zip(*[(i,x[0]+'/'+y[0]) for i,(x,y) in enumerate(zip(l1,l2)) if x[0] != y[0]])
return list(o1),list(o2)
#Make input lists
list1 = ['G','T','C','A','G']
list2 = ['AAAAA','TTTT','GGGG','CCCCCCCC']
#Call the function to get the output lists
outlist1,outlist2 = short_get_mismatches(list1,list2)
EDIT:
I'm not sure if I'm cleaning the sequence as you want w/ the N's and -'s. Is this the answer to the example in your comment?
Unclean list1 ['A', 'T', 'G', 'C', 'A', 'C', 'G', 'T', 'C', 'G']
Clean list1 ['A', 'T', 'G', 'C', 'A', 'C', 'G', 'T', 'C', 'G']
Unclean list2 ['GGG', 'TTTN', '-', 'NNN', 'AAA', 'CCC', 'GCCC', 'TTT', 'CCCTN']
Clean list2 ['GGG', 'TTT', 'AAA', 'CCC', 'GCCC', 'TTT', 'CCCT']
0 A GGG
1 T TTT
2 G AAA
3 C CCC
4 A GCCC
5 C TTT
6 G CCCT
['A/G', 'G/A', 'A/G', 'C/T', 'G/C']
[0, 2, 4, 5, 6]

this works fine for my question:
#!/usr/bin/env python
list1=['A', 'T', 'G', 'C', 'A' ,'C', 'G' , 'T' , 'C', 'G']
list2=[ 'GGG' , 'TTTN' , ' - ' , 'NNN' , 'AAA' , 'CCC' , 'GCCC' , 'TTT' ,'CCCATN' ]
notifications = []
indexes = []
for i in range(min(len(list1), len(list2))):
item1 = list1[i]
item2 = list2[i]
# Skip ' - '
if item2 == ' - ':
continue
# Remove N since it's a wildcard
item2 = item2.replace('N', '')
# Remove item1
item2 = item2.replace(item1, '')
chars = set(item2)
# All matched
if len(chars) == 0:
continue
notifications.append('{}/{}'.format(item1, '/'.join(set(item2))))
indexes.append(i)
print(notifications)
print(indexes)
It gives the output as
['A/G', 'G/C', 'C/A/T']
[0, 6, 8]

Related

Is there a way to group_by with_index in Crystal?

So I have this (nicely sorted) array.
And sometimes I need all of the elements from the array. But other times I need all of the even-indexed members together and all of the odd-indexed members together. And then again, sometimes I need it split into three groups with indices 0,3,6 etc. in one group, then 1,4,7 in the next and finally 2,5,8 in the last.
This can be done with group_by and taking the modulus of the index. See for yourself:
https://play.crystal-lang.org/#/r/4kzj
arr = ['a', 'b', 'c', 'd', 'e']
puts arr.group_by { |x| arr.index(x).not_nil! % 1 } # {0 => ['a', 'b', 'c', 'd', 'e']}
puts arr.group_by { |x| arr.index(x).not_nil! % 2 } # {0 => ['a', 'c', 'e'], 1 => ['b', 'd']}
puts arr.group_by { |x| arr.index(x).not_nil! % 3 } # {0 => ['a', 'd'], 1 => ['b', 'e'], 2 => ['c']}
But that not_nil! in there feels like a code-smell / warning that there's a better way.
Can I get the index of the elements without needing to look it up and handle the Nil type?
You can also just do:
arr = ['a', 'b', 'c', 'd', 'e']
i = 0
puts arr.group_by { |x| i += 1; i % 1 }
i = 0
puts arr.group_by { |x| i += 1; i % 2 }
i = 0
puts arr.group_by { |x| i += 1; i % 3 }
Besides the nilable return type, it's also very inefficient to call Array#index for each element. This means a runtime of O(N²).
#group_by is used for grouping by value, but you don't need the value for grouping as you just want to group by index. That can be done a lot easier than wrapping around #group_by and #index
A more efficient solution is to loop over the indices and group the values based on the index:
groups = [[] of Char, [] of Char]
arr.each_index do |i|
groups[i % 2] << arr[i]
end
There is no special method for this, but it's fairly simple to implement yourself.
If you don't need all groups, but only one of them, you can also use Int32#step to iterate every other index:
group = [] of Char
2.step(to: arr.size - 1, by: 3) do |i|
group << arr[i]
end

Merging to lists of different length to for list of dictionaries

Hi please help me develop a logic which does following.
list_1 = [1,2,3]
list_2 = [a,b,c,d,e,f,g,h,i]
Required output (List of dictionaries):
output = [{1:a,2:b,3:c}, {1:d,2:e,3:f}, {1:g,2:h,3:i}]
My script:
return_list = []
k = 0
temp_dict = {}
for i, value in enumerate(list_2):
if k <= len(list_1)-1:
temp_dict[list_1[k]] = value
if k == len(list_1)-1:
k = 0
print temp_dict
return_list.append(temp_dict)
print return_list
print '\n'
else:
k = k + 1
print return_list
My output:
{1: 'a', 2: 'b', 3: 'c'}
[{1: 'a', 2: 'b', 3: 'c'}]
{1: 'd', 2: 'e', 3: 'f'}
[{1: 'd', 2: 'e', 3: 'f'}, {1: 'd', 2: 'e', 3: 'f'}]
{1: 'g', 2: 'h', 3: 'i'}
[{1: 'g', 2: 'h', 3: 'i'}, {1: 'g', 2: 'h', 3: 'i'}, {1: 'g', 2: 'h', 3: 'i'}]
[{1: 'g', 2: 'h', 3: 'i'}, {1: 'g', 2: 'h', 3: 'i'}, {1: 'g', 2: 'h', 3: 'i'}]
As you can see, temp_dict is getting printed correctly, but return_list is the last temp_dict 3 times.
please help to solve.
The issue here is that you are not reseting temp_dict to a new object.
When you append it to the list, it still maintains the reference to the dict object, after you change it on the next loop, it changes the value on the array because it's the same reference.
If you reset the value it should work
list_1 = [1,2,3]
list_2 = ['a','b','c','d','e','f','g','h','i']
return_list = []
k = 0
temp_dict = {}
for i, value in enumerate(list_2):
if k <= len(list_1)-1:
temp_dict[list_1[k]] = value
if k == len(list_1)-1:
k = 0
print temp_dict
return_list.append(temp_dict)
temp_dict = {} # Here is the change
print return_list
print '\n'
else:
k = k + 1
print return_list
This should work:
list_1 = [1,2,3]
list_2 = ['a','b','c','d','e','f','g','h','i']
output = []
j = 0
for i in range(1, len(list_1) + 1):
output.append(dict(zip(list_1, list_2[j:i * 3])))
j = i * 3
print(output)
The assumption is that you second list is exactly 3 times larger than the first list.
def merge_them(list1, list2):
output = []
i = 0
while i < len(list_2):
output.append(dict(zip(list_1, list_2[i: i + len(list1)])))
i += len(list1)
return output
and you can test it:
test1:
list_1 = [1,2,3]
list_2 = ['a','b','c','d','e','f','g','h','i']
print merge_them(list_1, list_2)
you will get:
[{1: 'a', 2: 'b', 3: 'c'}, {1: 'd', 2: 'e', 3: 'f'}, {1: 'g', 2: 'h', 3: 'i'}]
test2:
list_1 = [1,2,3,]
list_2 = ['a','b','c','d','e']
print merge_them(list_1, list_2)
you will get:
[{1: 'a', 2: 'b', 3: 'c'}, {1: 'd', 2: 'e'}]
test3:
list_1 = [1,2,3,4,5,6,7,8]
list_2 = ['a','b','c','d','e']
print merge_them(list_1, list_2)
you will get:
[{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e'}]

Appending a list built using conditional and appended values to a list of lists

I'm currently working with two large csv files of numerical data. One such csv, which we will call X, is composed entirely of numerical data for test subjects. The columns of a are arranged as health measurements like so (id, v1,v2,v3,v4). I am trying to take this information and create a list of lists where each list contains the information for a single person i.e as in this fashion:
X=[['1','a','b','c','d'],
['1','e','f','g','h'],
['2','i','j','k','l'],
['3','m','n','o','p']]
listoflists=[ [['1','a','b','c','d'],['1','e','f','g','h']], #first row
['2','i','j','k','l'], #second
['3','m','n','o','p'] ] #third
(let me know if i should edit the formatting: i wanted to present X as columns for readability. On list of lists I just ran out of room, so listolists = [ a,b,c], where a is the first row, b is the second, and c is third
I've tried something to the effect of this, but my biggest issue is I'm not sure where to create the list of those entities with matching data and then append it to the "master list".
#create a set that holds the values of the subject ids.
ids=list(set([item[0] for item in X]))
#create the list of lists i want
listolists=[]
for value in ids:
listolists.append(sublist)
for i in range(len(X))
sublist=[] #I'm not sure where to create sublists of
#matching data and append to listolists
if value == X[i][0]
sublist.append(X[i]
All help is appreciated. thanks.
Here is something:
X =[
['1','a','b','c','d'],
['1','e','f','g','h'],
['2','i','j','k','l'],
['3','m','n','o','p'],
]
numbers = {x[0] for x in X}
output = []
for num in sorted(numbers):
new_list = [sub_list for sub_list in X if sub_list[0] == num]
output.append(new_list)
print(output)
...
[[['1', 'a', 'b', 'c', 'd'], ['1', 'e', 'f', 'g', 'h']],
[['2', 'i', 'j', 'k', 'l']],
[['3', 'm', 'n', 'o', 'p']]]
If you need to 2nd and third list not nested like the first let me know
EDIT - for exact format specified in your question
X =[
['1','a','b','c','d'],
['1','e','f','g','h'],
['2','i','j','k','l'],
['3','m','n','o','p'],
]
numbers = {x[0] for x in X}
output = []
for num in sorted(numbers):
new_list = [sub_list for sub_list in X if sub_list[0] == num]
if len(new_list) > 1:
output.append(new_list)
else:
output.append((new_list)[0])
print(output)

Filter and limit on a python dictionary

Given:
obj = {}
obj['a'] = ['x', 'y', 'z']
obj['b'] = ['x', 'y', 'z', 'u', 't']
obj['c'] = ['x']
obj['d'] = ['y', 'u']
How do you select (e.g. print) the top 2 entries in this dictionary, sorted by the length of each list?
the top 2 entries in this dictionary, sorted by the length of each
list
print(sorted(obj.values(), key=len)[:2])
The output:
[['x'], ['y', 'u']]

How do I Pythonically print a list with formatting?

I have a list:
L = [1, 2, 3, 4, 5, 6]
and I want to print
1 B 2 J 3 C 4 A 5 J 6 X
from that list.
How do I do that?
Do I have to make another list and zip them up, or is there some way I can have the letters in my format specifier?
You could do it either way:
L = [1, 2, 3, 4, 5, 6]
from itertools import chain
# new method
print "{} B {} J {} C {} A {} J {} X".format(*L)
# old method
print "%s B %s J %s C %s A %s J %s X" % tuple(L)
# without string formatting
print ' '.join(chain.from_iterable(zip(map(str, L), 'BJCAJX')))
See the docs on str.format and string formatting.
A nice way to do this is have a dictionary of numbers to prefixes:
prefixes = {1: 'B', 2: 'J', 3: 'C', 4: 'A', 5: 'J', 6: 'X'}
Then you can do:
print ' 'join('%s %s' % (num, prefix) for num, prefix in prefixes.itervalues())
If you also have a list of letters:
nums = [1, 2, 3, 4, 5, 6]
ltrs = ['B', 'J', 'C', 'A', 'J', 'X']
print ' '.join('%s %s' % (num, ltr) for num, ltr in zip(nums, ltrs)