CSV reading in python 2.7 - python-2.7

I wrote that code for csv reading, but right now I have this problem:
ValueError: invalid literal for float():
4.000E+00;3.125E-07;-7.854E-13
Here is my code:
import numpy as np
import matplotlib.pyplot as plt
def read_datafile(file_name):
    """Load a numeric text file into a NumPy array.

    Thin wrapper around ``np.loadtxt`` with the field separator fixed to a
    single space.
    """
    return np.loadtxt(file_name, delimiter=' ')
# For each model file Mod0.csv ... Mod24.csv, plot the magnitude of the
# complex value (column 1 + i * column 2) against column 0.
for r in range(25):
    data = read_datafile(r"S:\Dok\Python\Data\Codes\Model2\Mod{}.csv".format(r))
    x, y, z = data[:, 0], data[:, 1], data[:, 2]
    degree = u"\u00b0"  # degree sign appended to the title
    fig = plt.figure(1)
    plt.title("Model {}".format(r) + degree)
    magnitude = abs(y + 1j * z)
    plt.plot(x, magnitude, color='k')
    plt.show()
My files look like this:

You have np.loadtxt(file_name, delimiter=' '), but according to your error:
ValueError: invalid literal for float(): 4.000E+00;3.125E-07;-7.854E-13,
... it is semicolon-delimited.
Because of this, each whole line is read as a single field, so the conversion to float fails. If you change it to delimiter=';', it should work.

Related

Return event.xdata from a function

I have made a small GUI to select points on a given image, plot the points, interpolate them and save them. I would like the interpolated points to be the return value of the function gui_pos(image), but I couldn't find a way to do it. So far I have worked around it by saving the interpolated points in a .pckl file, but that is not a good solution.
The code is the following:
from PIL import Image
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
def interp_mia(x, y, xx):
    """Evaluate at ``xx`` the 1-D interpolant through the points ``(x, y)``.

    Built with ``scipy.interpolate.interp1d`` using its default kind and
    ``fill_value="extrapolate"``, so values of ``xx`` outside the range of
    ``x`` are extrapolated instead of raising.
    """
    return interp1d(x, y, fill_value="extrapolate")(xx)
def onclick(event):
    """Mouse-press callback: collect clicked points, finish on right-click.

    Every press inside the axes is drawn as a dot and appended to the
    module-level ``coordsx`` / ``coordsy`` lists.  A press with button 3
    additionally interpolates the collected points with ``interp_mia`` on a
    unit-spaced grid from the smallest to the largest clicked x, draws the
    resulting curve, disconnects this callback and pickles
    ``np.array((xx, yy))`` to ``gui_pos.pckl``.

    Relies on the globals ``fig``, ``cid``, ``coordsx`` and ``coordsy`` that
    ``gui_pos`` sets up.
    """
    # Matplotlib sets xdata/ydata to None when the pointer is outside the
    # axes; such a press has no usable coordinate and would poison the
    # later min/max and interpolation, so ignore it.
    if event.xdata is None or event.ydata is None:
        return

    plt.plot(event.xdata, event.ydata, '.')
    fig.canvas.draw()
    coordsx.append(event.xdata)
    coordsy.append(event.ydata)

    if event.button == 3:
        xx = np.arange(np.min(coordsx), np.max(coordsx))
        yy = interp_mia(coordsx, coordsy, xx)
        print('you pressed', event.button)
        plt.plot(xx, yy, 'k-')
        fig.canvas.draw()
        fig.canvas.mpl_disconnect(cid)
        dum = np.array((xx, yy))
        # ``with`` closes the file even if pickling raises.
        with open('gui_pos.pckl', 'wb') as f:
            pickle.dump(dum, f)
def gui_pos(image):
    """Show ``image`` in a new figure and start collecting mouse clicks.

    Resets the module-level ``coordsx`` / ``coordsy`` lists, stores the new
    figure in ``fig`` and the connection id of the ``onclick`` handler in
    ``cid``.  Returns ``None``.
    """
    global coordsx, coordsy, fig, cid, xx, yy, slit

    coordsx, coordsy = [], []
    fig = plt.figure()
    fig.add_subplot(111).imshow(image, origin="lower")
    cid = fig.canvas.mpl_connect('button_press_event', onclick)
Any ideas?
Thank you,
Qarolina

How to reshape a numpy array from (#dim1,#dim2,#channel) to (#channel, #dim1,#dim2)

I have an array with the shape of (#dim1,#dim2,#channel). I want to reshape it to (#channel, #dim1,#dim2).
The plt.reshape(x, (#channel, #dim1,#dim2)) shows me a wrong image.
If you are using the Cifar10 dataset you could use the following code:
import numpy as np
import matplotlib.pyplot as plt
import cPickle
def unpickle(file):
    """Return the object stored in the pickle file at path ``file``."""
    with open(file, 'rb') as handle:
        return cPickle.load(handle)
# Load one batch file and pull out its 'data' entry.
imageDict = unpickle('cifar-10-batches-py/data_batch_2')
imageArray = imageDict['data']

# Rearrange into 10000 images of 32x32 pixels with the 3 channels last:
# a Fortran-order reshape followed by swapping the two spatial axes.
imageArray = imageArray.reshape(10000, 32, 32, 3, order='F')
imageArray = np.swapaxes(imageArray, 1, 2)

# Translate each numeric label into its class name.
labels = ['airplane','automobile','bird','cat','deer','dog','frog','horse','ship','truck']
imageLabels = [labels[i] for i in imageDict['labels']]

# Show 16 randomly chosen images in a 4x4 grid, each titled with its class.
fig, ax = plt.subplots(4, 4, figsize=(8, 8))
for axIndex in np.ndindex(4, 4):
    index = np.random.randint(0, 10000)
    panel = ax[axIndex]
    panel.imshow(imageArray[index], origin='upper')
    panel.set_title(imageLabels[index])
    panel.axis('off')
fig.show()
Which gives you:

Make a comma-separated list out of coordinates from a csv file

I have x and y values in a csv file, and I am reading those values and converting them into a numpy array using the code below:
import numpy as np
import csv
# Load the comma-separated file into a NumPy array
data = np.loadtxt('datapoints.csv', delimiter=',')
# Take the first two columns as x and y
x = data[:, 0]
y = data[:, 1]
# NOTE(review): the return values of these two calls are discarded, so x and
# y are left unchanged; np.asarray also returns an ndarray, not a plain list.
np.asarray(x)
np.asarray(y)
So now I have the values of x and y.
But I want them to be in this format:
[[x1,y1],[x2,y2], [x3,y3], ...... [xn,yn]]
How do I do that?
Use zip:
# Pair up corresponding x and y values, turning each pair into a 2-item list.
result = list(map(list, zip(np.asarray(x), np.asarray(y))))

extracting and plotting atomic coordinates from pdb file python

I am trying to extract just the alpha carbon coordinates and plot them in a 3D representation. The top half of the following code works fine, but I can't seem to plot my results.
import re
import glob
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
coord = []
# Matches an ATOM record whose atom name is CA, through its three decimal
# coordinate fields.
pattern = re.compile(
    r'ATOM\s{5,}\d+\s{2}CA\s{2,}\w{3}\s\w\s{2,}\d+\s{6}\d+\.\d+\s\d+\.\d+\s{2}\d+\.\d+',
    flags=re.S)
for file in glob.glob('file_rank_1.pdb'):
    with open(file) as fp:
        for result in pattern.findall(fp.read()):
            # The last 22 characters of each match hold the coordinates.
            output = result[-22:]
            coord = ",".join(output.split())
            c = coord.split(',')
            print(c)
# At this point c is the list of strings from the last match only.
X, Y, Z = c
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_wireframe(X, Y, Z)
ax.set_xlabel('x axis')
ax.set_ylabel('y axis')
ax.set_zlabel('z axis')
plt.show()
My results from running the above looks like...
['72.438', '109.283', '43.980']
['75.664', '110.907', '45.079']
['74.354', '111.094', '48.594']
['73.380', '107.449', '48.722']
['76.614', '106.603', '46.958']
['79.740', '105.625', '48.895']
['82.425', '107.703', '47.318']
['80.088', '110.405', '46.265']
['78.710', '110.389', '49.818']
['82.235', '110.471', '51.200']
['82.841', '113.550', '49.133']
['79.233', '114.754', '49.675']
['78.633', '113.745', '53.295']
['77.041', '117.182', '53.503']
['73.963', '116.530', '51.505']
['73.696', '113.058', '52.933']
TypeError: Cannot cast array data from dtype('float64') to dtype('<U32') according to the rule 'safe'
The above code opens the graph interface, but it remains blank. There is also a full screen of red file messages from the interactive shell that I left off to try to save space in this question.
How can I plot the numbers found in c? Thanks
There are a few things to point out:
1) In the following block, c is a list of strings not floats.
# Parsing loop quoted from the question: c is produced by str.split().
with open(file) as fp:
    for result in pattern.findall(fp.read()):
        output = result[-22:]
        coord = " ".join(output.split())
        coord = coord.replace(" ",",")
        c = coord.split(',')
        print(c)
You can change them using:
[float(i) for i in c]
2) When you set X,Y,Z = (c), that c is only the last item in the loop. So you should append each c within the loop to collect all coordinates.
3) You might want to use numpy for array manipulations.
So hopefully the following will work:
import re
import numpy as np
import glob
from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
points = []
# Matches an ATOM record whose atom name is CA, through its three decimal
# coordinate fields.  Raw string so the regex escapes reach `re` untouched.
pattern = re.compile(
    r'ATOM\s{5,}\d+\s{2}CA\s{2,}\w{3}\s\w\s{2,}\d+\s{6}\d+\.\d+\s\d+\.\d+\s{2}\d+\.\d+',
    flags=re.S)
for path in glob.glob('file_rank_1.pdb'):
    with open(path) as fp:
        for result in pattern.findall(fp.read()):
            # The last 22 characters of a match hold the three coordinates;
            # split() without arguments already copes with variable padding.
            c = [float(value) for value in result[-22:].split()]  # change them to float
            points.append(c)
            print(c)
# One row per atom; transpose to get the x, y and z columns.
X, Y, Z = np.array(points).T
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# The atoms form a 1-D chain of points, so draw them as a 3-D line:
# plot_wireframe is meant for 2-D coordinate grids.
ax.plot(X, Y, Z)
ax.set_xlabel('x axis')
ax.set_ylabel('y axis')
ax.set_zlabel('z axis')
plt.show()

Feature selection in scikit learn for multiple variables and thousands+ features

I am trying to perform feature selection for a logistic regression classifier. Originally there are 4 variables: name, location, gender, and label = ethnicity. The three input variables (the name in particular) give rise to tens of thousands more "features"; for example, the name "John Snow" gives rise to 2-letter substrings like 'jo', 'oh', 'hn', etc. The feature set is then vectorized with DictVectorizer.
I am trying to follow this tutorial (http://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html) but I am not sure if I am doing it right since the tutorial is using a small number of features while mine has tens of thousands after vectorization. And also the plt.show() shows a blank figure.
# coding=utf-8
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import re
import random
import time
from random import randint
import csv
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
# Assign X and y variables
# NOTE(review): `df` is not defined in the code shown here; presumably a
# pandas DataFrame loaded elsewhere -- confirm.
X = df.raw_name.values
X2 = df.name.values
X3 = df.gender.values
X4 = df.location.values
# y holds the ethnicity column that is later passed to the classifier as target
y = df.ethnicity_scan.values
# Feature extraction functions
def feature_full_name(nameString):
    """Return ``nameString`` unchanged if it is longer than one character.

    A name with fewer than two characters, or a value that has no length at
    all (e.g. ``None`` or a float NaN), is replaced by ``'?'``.
    """
    try:
        usable = len(nameString) > 1  # not accept name with only 1 character
    except TypeError:
        # Not a sized object, so it cannot be a usable name.
        return '?'
    return nameString if usable else '?'
def feature_avg_wordLength(nameString):
    """Return the average word length of ``nameString`` as a whole-number float.

    Words are delimited by single space characters: the result is the number
    of non-space characters divided by (number of spaces + 1), rounded to
    zero decimals.  Input that cannot be iterated or measured (e.g. ``None``)
    yields ``0``.
    """
    try:
        space = sum(1 for ch in nameString if ch == ' ')
        length = float(len(nameString) - space)
    except TypeError:
        return 0
    name_entity = float(space + 1)  # number of space-separated parts
    return round(length / name_entity, 0)
def feature_name_entity(nameString2):
    """Return the number of space-separated parts in ``nameString2``.

    Computed as (number of space characters + 1).  Input that cannot be
    iterated (e.g. ``None``) yields ``0``.
    """
    try:
        return sum(1 for ch in nameString2 if ch == ' ') + 1
    except TypeError:
        return 0
def feature_gender(genString):
    """Return ``genString`` unchanged if it is non-empty, otherwise ``'?'``.

    Values without a length (e.g. ``None`` or a float NaN) also map to
    ``'?'``.
    """
    try:
        return genString if len(genString) >= 1 else '?'
    except TypeError:
        # Not a sized object, so there is no gender value to keep.
        return '?'
def feature_noNeighborLoc(locString):
    """Return the part of ``locString`` that follows its first ``', '``.

    Everything up to the first comma is dropped, then the comma and the
    character after it are cut off, so ``"New York, USA"`` becomes ``"USA"``.
    A string without a comma yields ``''``; a non-string value (e.g.
    ``None``) yields ``'?'``.
    """
    try:
        # Stop only at the comma: excluding spaces as well would cut a
        # multi-word first component in the middle.
        x = re.sub(r'^[^,]*', '', locString)  # remove everything before the first ','
    except TypeError:
        return '?'
    return x[2:]  # remove the ',' and the ' ' that follows it
def list_to_dict(substring_list):
    """Map every item of ``substring_list`` to a ``'substring=<item>': True`` entry.

    Returns the resulting dict, or ``'?'`` when the input cannot be iterated
    or an item cannot be converted with ``str``.
    """
    try:
        return {'substring=' + str(item): True for item in substring_list}
    except (TypeError, ValueError):
        # TypeError: not iterable; ValueError covers str() encoding errors.
        return '?'
# Transform format of X variables into one feature dict per sample, then
# vectorize them into the feature matrix newX
my_dict13 = [{'name-entity': feature_name_entity(feature_full_name(i))} for i in X2]
my_dict14 = [{'avg-length': feature_avg_wordLength(feature_full_name(i))} for i in X]
# Gender goes through feature_gender: feature_full_name would turn any
# one-character value into '?'.
my_dict15 = [{'gender': feature_gender(i)} for i in X3]
my_dict16 = [{'location': feature_noNeighborLoc(feature_full_name(i))} for i in X4]
# NOTE(review): 'dummy1' has the same value for every sample, so it is a
# constant feature -- likely one source of the "Features ... are constant"
# warning.
my_dict17 = [{'dummy1': 1} for i in X]
my_dict18 = [{'dummy2': random.randint(0,2)} for i in X]

# Merge the six single-entry dicts of each sample into one dict.
feature_groups = (my_dict13, my_dict14, my_dict15, my_dict16, my_dict17, my_dict18)
all_dict = []
for parts in zip(*feature_groups):
    temp_dict = {}
    for part in parts:
        temp_dict.update(part)
    all_dict.append(temp_dict)

# The vectorizer has to exist before it is used.
dv = DictVectorizer()
newX = dv.fit_transform(all_dict)
# Separate the training and testing data sets
# half_cut is negative: everything except the last half is used for training,
# the last half for testing (no shuffling).
half_cut = int(len(df)/2.0)*-1
X_train = newX[:half_cut]
X_test = newX[half_cut:]
y_train = y[:half_cut]
y_test = y[half_cut:]
# Fitting X and y into model, using training data
lr = LogisticRegression()
lr.fit(X_train, y_train)
# NOTE(review): this assignment comes too late to serve the
# dv.fit_transform(...) call made earlier in the script.
dv = DictVectorizer()
# Feature selection
plt.figure(1)
plt.clf()
X_indices = np.arange(X_train.shape[-1])
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(X_train, y_train)
pvalues = np.asarray(selector.pvalues_, dtype=float)
# A constant feature has no defined F-test, so its p-value is NaN. One NaN
# makes scores.max() NaN and, after the division, every bar NaN, which
# leaves the figure blank. Treat such features as uninformative (p = 1).
pvalues = np.where(np.isnan(pvalues), 1.0, pvalues)
# A p-value that underflowed to 0 would give an infinite score; clip it to
# the smallest positive float instead.
pvalues = np.clip(pvalues, np.finfo(float).tiny, 1.0)
scores = -np.log10(pvalues)
top_score = scores.max()
if top_score > 0:
    # Normalize to [0, 1]; skipped when every score is 0 to avoid 0/0.
    scores /= top_score
# NOTE(review): with tens of thousands of features, bars of width .2 may be
# too thin to see -- consider plotting only the top-scoring features.
plt.bar(X_indices - .45, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)', color='g')
plt.show()
Warning:
E:\Program Files Extra\Python27\lib\site-packages\sklearn\feature_selection\univariate_selection.py:111: UserWarning: Features [[0 0 0 ..., 0 0 0]] are constant.
It looks like the way you split your data into training and testing sets is not working:
# Separate the training and testing data sets
X_train = newX[:half_cut]
X_test = newX[half_cut:]
If you already use sklearn, it is much more convenient to use the builtin splitting routine for this:
# train_test_split lives in sklearn.model_selection (it was in
# sklearn.cross_validation before scikit-learn 0.18).
from sklearn.model_selection import train_test_split
# Split the vectorized feature matrix newX, not the raw name array X.
X_train, X_test, y_train, y_test = train_test_split(newX, y, test_size=0.5, random_state=0)