How to predict a single new sample after dict-vectorization in python scikit-learn? - python-2.7

I am using a logistic regression classifier to predict an ethnicity class label of 0 or 1. My data is split into training and testing samples and is dict-vectorized into a sparse matrix.
The following is the working code, in which I predict and validate on X_train and X_test, the parts of the features that were vectorized:
for i in mass[k]:
    df = df_temp # reset df before each loop
    #$$
    if 1==1:
        count+=1
        ethnicity_tar = str(i)
        ############################################
        ############################################

        def ethnicity_target(row):
            try:
                if row[ethnicity_var] == ethnicity_tar:
                    return 1
                else:
                    return 0
            except: return None

        df['ethnicity_scan'] = df.apply(ethnicity_target, axis=1)
        print '1=', ethnicity_tar
        print '0=', 'non-'+ethnicity_tar

        # Random sampling a smaller dataframe for debugging
        rows = df.sample(n=subsample_size, random_state=seed) # Seed gives fixed randomness
        df = DataFrame(rows)
        print 'Class count:'
        print df['ethnicity_scan'].value_counts()

        # Assign X and y variables
        X = df.raw_name.values
        X2 = df.name.values
        X3 = df.gender.values
        X4 = df.location.values
        y = df.ethnicity_scan.values

        # Feature extraction functions
        def feature_full_name(nameString):
            try:
                full_name = nameString
                if len(full_name) > 1: # not accept name with only 1 character
                    return full_name
                else: return '?'
            except: return '?'

        def feature_full_last_name(nameString):
            try:
                last_name = nameString.rsplit(None, 1)[-1]
                if len(last_name) > 1: # not accept name with only 1 character
                    return last_name
                else: return '?'
            except: return '?'

        def feature_full_first_name(nameString):
            try:
                first_name = nameString.rsplit(' ', 1)[0]
                if len(first_name) > 1: # not accept name with only 1 character
                    return first_name
                else: return '?'
            except: return '?'

        # Transform format of X variables, and spit out a numpy array for all features
        my_dict = [{'last-name': feature_full_last_name(i)} for i in X]
        my_dict5 = [{'first-name': feature_full_first_name(i)} for i in X]

        all_dict = []
        for i in range(0, len(my_dict)):
            temp_dict = dict(
                my_dict[i].items() + my_dict5[i].items()
            )
            all_dict.append(temp_dict)
        newX = dv.fit_transform(all_dict)

        # Separate the training and testing data sets
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(newX, y, test_size=testTrainSplit)

        # Fitting X and y into model, using training data
        classifierUsed2.fit(X_train, y_train)

        # Making predictions using trained data
        y_train_predictions = classifierUsed2.predict(X_train)
        y_test_predictions = classifierUsed2.predict(X_test)
However, I would like to predict the ethnicity label for just a single name, for example "John Carter". I replaced y_train_predictions = classifierUsed2.predict(X_train) and y_test_predictions = classifierUsed2.predict(X_test) with the following line, but it results in an error:
print classifierUsed2.predict(["John Carter"])
#error
Error: X has 1 features per sample; expecting 103916

You need to transform the new data in exactly the same way as the training data, so something like this (if your input data were just a list of strings):
classifierUsed2.predict(dv.transform(["John Carter"]))
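Since in your code the vectorizer was actually fitted on feature dicts (not raw strings), a closer sketch would rebuild the same dict for the single name and then call transform; the names below reuse your own dv, classifierUsed2 and feature functions:
# Minimal sketch: build the same feature dict for one name, then reuse the
# DictVectorizer that was fitted on the training data.
new_name = "John Carter"
new_dict = {
    'last-name': feature_full_last_name(new_name),
    'first-name': feature_full_first_name(new_name),
}
# transform (NOT fit_transform), so the columns line up with the training matrix
new_X = dv.transform([new_dict])
print classifierUsed2.predict(new_X)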

Related

Redis session does not store variables when modified in thread - Flask

I have a thread running inside a route; the thread's job is to do some expensive work and store variables, and then I need to use those variables in another Flask route.
When I pass the session variable (backed by Redis) into the thread function as a parameter in order to add the data and extract it later, it does not find the variables that I stored in it.
In contrast, when I declare a global_dict and pass it to the thread function instead of session, the code works great.
Since the thread function can be used by multiple users simultaneously, storing the data in a global_dict is not good practice.
Why does using session in my code not work?
In the following code, if I replace global_dict with session, I won't be able to access it in the /result route.
Per the Redis docs:
"Redis can handle up to 2^32 keys, and was tested in practice to handle at least 250 million keys per instance.
Every hash, list, set, and sorted set, can hold 2^32 elements.
In other words your limit is likely the available memory in your system."
P.S. Sorry for the long code blocks.
#app.route("/build",methods=["GET", "POST"])
#login_required
def build():
if request.method == "POST":
global th
global finished
finished= False
#copy_current_request_context
def operation(global_dict):
global finished
symbols = request.form.get("symbols")
mc.set("symbols", symbols)
if contains_multiple_words(symbols) == False:
flash("The app purpose is to optimize a portfolio given a list of stocks. Please enter a list of stocks seperated by a new row.")
return redirect("/build")
Build(session["user_id"], symbols.upper(), request.form.get("start"), request.form.get("end"), request.form.get("funds"), request.form.get("short"), request.form.get("volatility"), request.form.get("gamma"), request.form.get("return"))
db.session.commit()
try:
df = yf.download(symbols, start=request.form.get("start"), end=request.form.get("end"), auto_adjust = False, prepost = False, threads = True, proxy = None)["Adj Close"].dropna(axis=1, how='all')
failed=(list(shared._ERRORS.keys()))
df = df.replace(0, np.nan)
try:
global_dict['listofna']=df.columns[df.isna().iloc[-2]].tolist()+failed
except IndexError:
flash("Please enter valid stocks from Yahoo Finance.")
return redirect("/build")
df = df.loc[:,df.iloc[-2,:].notna()]
except ValueError:
flash("Please enter a valid symbols (taken from Yahoo Finance)")
return redirect("/build")
def enter_sql_data(app, df, nasdaq_exchange_info, Stocks):
for ticker in df.columns:
ticker=ticker.upper()
if any(sublist[1]==ticker in sublist for sublist in nasdaq_exchange_info) is False:
ticker_ln = yf.Ticker(ticker).stats()["price"].get('longName')
if not ticker_ln:
ticker_ln = ticker
ticker_list=[ticker_ln, ticker]
with app.app_context():
new_stock=Stocks(ticker, ticker_ln)
db.session.add(new_stock)
db.session.commit()
nasdaq_exchange_info.extend([ticker_list])
global nasdaq_exchange_info
app1 = app._get_current_object()
p1 = Process(target=enter_sql_data, args=[app1, df, nasdaq_exchange_info, Stocks])
p1.start()
prices = df.copy()
fig = px.line(prices, x=prices.index, y=prices.columns, title='Price Graph')
fig = fig.update_xaxes(rangeslider_visible=True)
fig.update_layout(width=1350, height=900)
global_dict['plot_json'] = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)
exp_cov = risk_models.exp_cov(prices, frequency=252)
#plotting the covariance matrix
heat = go.Heatmap(
z = risk_models.cov_to_corr(exp_cov),
x = exp_cov.columns.values,
y = exp_cov.columns.values,
zmin = 0, # Sets the lower bound of the color domain
zmax = 1,
xgap = 1, # Sets the horizontal gap (in pixels) between bricks
ygap = 1,
colorscale = 'RdBu'
)
title = 'Covariance matrix'
layout = go.Layout(
title_text=title,
title_x=0.5,
width=800,
height=800,
xaxis_showgrid=False,
yaxis_showgrid=False,
yaxis_autorange='reversed'
)
fig1=go.Figure(data=[heat], layout=layout)
fig1.update_layout(width=500, height=500)
global_dict['plot_json1'] = json.dumps(fig1, cls=plotly.utils.PlotlyJSONEncoder)
S = risk_models.CovarianceShrinkage(prices).ledoit_wolf()
heat = go.Heatmap(
z = risk_models.cov_to_corr(S),
x = S.columns.values,
y = S.columns.values,
zmin = 0, # Sets the lower bound of the color domain
zmax = 1,
xgap = 1, # Sets the horizontal gap (in pixels) between bricks
ygap = 1,
colorscale = 'RdBu'
)
title = 'Ledoit-Wolf shrinkage'
layout = go.Layout(
title_text=title,
title_x=0.5,
width=800,
height=800,
xaxis_showgrid=False,
yaxis_showgrid=False,
yaxis_autorange='reversed'
)
fig2=go.Figure(data=[heat], layout=layout)
fig2.update_layout(width=500, height=500)
global_dict['plot_json2'] = json.dumps(fig2, cls=plotly.utils.PlotlyJSONEncoder)
#Section 2 -Return estimation
#it is often a bad idea to provide returns using a simple estimate like the mean of past returns. Research suggests that better off not providing expected returns – you can then just find the min_volatility() portfolio or use HRP.
mu = pypfopt.expected_returns.capm_return(prices)
fig3 = px.bar(mu, orientation='h')
fig3.update_layout(width=700, height=500)
global_dict['plot_json3'] = json.dumps(fig3, cls=plotly.utils.PlotlyJSONEncoder)
#using risk models optimized for the Efficient frontier to reduce to min volitility, good for crypto currencies - not implemented in the website now.
ef = EfficientFrontier(None, S)
try:
ef.min_volatility()
weights = ef.clean_weights()
nu = pd.Series(weights)
fig4 = px.bar(nu, orientation='h')
fig4.update_layout(width=700, height=500)
global_dict['plot_json4'] = json.dumps(fig4, cls=plotly.utils.PlotlyJSONEncoder)
av=ef.portfolio_performance()[1]
global_dict['av']=round(av, 3)*1
#if we want to buy the portfolio mentioned above
df = df.iloc[[-1]]
for col in df.columns:
if col.endswith(".L"):
df.loc[:,col] = df.loc[:,col]*GBPtoUSD()
try:
latest_prices = df.iloc[-1]
except IndexError:
flash("There is an issue with Yahoo API please try again later")
return redirect("/")
# prices as of the day you are allocating
if float(request.form.get("funds")) <= 0 or float(request.form.get("funds")) == " ":
flash("Amount need to be a positive number")
return redirect("/build")
if float(request.form.get("funds")) < float(latest_prices.min()):
flash("Amount is not high enough to cover the lowest priced stock")
return redirect("/build")
try:
da = DiscreteAllocation(weights, latest_prices, total_portfolio_value=float(request.form.get("funds")))
except TypeError:
delisted=df.columns[df.isna().any()].tolist()
delisted= ", ".join(delisted)
flash("Can't get latest prices for the following stock/s, please remove to contiue : %s" % delisted)
return redirect("/build")
alloc, global_dict['leftover'] = da.lp_portfolio()
global_dict['alloc']=alloc
global_dict['latest_prices']=latest_prices
except ValueError:
pass
#Maximise return for a given risk, with L2 regularisation
try:
ef = EfficientFrontier(mu, S)
ef.add_objective(objective_functions.L2_reg, gamma=(float(request.form.get("gamma")))) # gamme is the tuning parameter
ef.efficient_risk(int(request.form.get("volatility"))/100)
weights = ef.clean_weights()
su = pd.DataFrame([weights])
fig5 = px.pie(su, values=weights.values(), names=su.columns)
fig5.update_traces(textposition='inside')
fig5.update_layout(width=500, height=500, uniformtext_minsize=12, uniformtext_mode='hide')
global_dict['plot_json5'] = json.dumps(fig5, cls=plotly.utils.PlotlyJSONEncoder)
global_dict['perf'] =ef.portfolio_performance()
except Exception as e:
flash(str(e))
return redirect("/build")
#if we want to buy the portfolio mentioned above
for col in df.columns:
if col.endswith(".L"):
df.loc[:,col] = df.loc[:,col]*GBPtoUSD()
latest_prices1 = df.iloc[-1] # prices as of the day you are allocating
if float(request.form.get("funds")) <= 0 or float(request.form.get("funds")) == " ":
flash("Amount need to be a positive number")
return redirect("/build")
if float(request.form.get("funds")) < float(latest_prices.min()):
flash("Amount is not high enough to cover the lowest priced stock")
return redirect("/build")
da = DiscreteAllocation(weights, latest_prices, total_portfolio_value=float(request.form.get("funds")))
alloc1, global_dict['leftover1'] = da.lp_portfolio()
global_dict['alloc1']=alloc1
global_dict['latest_prices1']=latest_prices1
#Efficient semi-variance optimization
returns = pypfopt.expected_returns.returns_from_prices(prices)
returns = returns.dropna()
es = EfficientSemivariance(mu, returns)
try:
es.efficient_return(float(request.form.get("return"))/100)
except ValueError as e:
flash(str(e))
return redirect("/build")
global_dict['perf2']=es.portfolio_performance()
weights = es.clean_weights()
#if we want to buy the portfolio mentioned above
for col in df.columns:
if col.endswith(".L"):
df.loc[:,col] = df.loc[:,col]*GBPtoUSD()
latest_prices2 = df.iloc[-1] # prices as of the day you are allocating
if float(request.form.get("funds")) <= 0 or float(request.form.get("funds")) == " ":
flash("Amount need to be a positive number")
return redirect("/build")
if float(request.form.get("funds")) < float(latest_prices.min()):
flash("Amount is not high enough to cover the lowest priced stock")
return redirect("/build")
da = DiscreteAllocation(weights, latest_prices, total_portfolio_value=float(request.form.get("funds")))
alloc2, global_dict['leftover2'] = da.lp_portfolio()
global_dict['alloc2']=alloc2
global_dict['latest_prices2']=latest_prices2
mc.delete("symbols")
global_dict['ret']=float(request.form.get("return"))
global_dict['gamma']=request.form.get("gamma")
global_dict['volatility']=request.form.get("volatility")
finished = True
global global_dict
th = Thread(target=operation, args=[global_dict])
th.start()
return render_template("loading.html")
else:
if mc.get("symbols"):
cached_symbols=mc.get("symbols")
else:
cached_symbols=''
availableCash=db.session.query(Users.cash).filter_by(id=session["user_id"]).first().cash
return render_template("build.html", availableCash=round(availableCash, 4), GBP=GBPtoUSD(), nasdaq_exchange_info=nasdaq_exchange_info, cached_symbols=cached_symbols, top_50_crypto=top_50_crypto, top_world_stocks=top_world_stocks, top_US_stocks=top_US_stocks, top_div=top_div)
app.route('/result')
def result():
return render_template("built.html",av=global_dict['av'], leftover=global_dict['leftover'], alloc=global_dict['alloc'], ret=global_dict['ret'],gamma=global_dict['gamma'],volatility=global_dict['volatility'],perf=global_dict['perf'], perf2=global_dict['perf2'], alloc1=global_dict['alloc1'], alloc2=global_dict['alloc2'], plot_json=global_dict['plot_json'], plot_json1=global_dict['plot_json1'], plot_json2=global_dict['plot_json2'], plot_json3=global_dict['plot_json3'], plot_json4=global_dict['plot_json4'], plot_json5=global_dict['plot_json5'], leftover1=global_dict['leftover1'], leftover2=global_dict['leftover2'],listofna=(', '.join(global_dict['listofna'])))

tensorflow error: InvalidArgumentError: Shape mismatch in tuple component 1. Expected [1], got [5]

I am trying to construct a batch of (wav_file, label) pairs.
The wav file labels and paths are listed in dev.csv.
The code below is not working:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
threads = 1
batch_size = 5

global record_defaults
record_defaults = [['/Users/phoenix/workspace/dataset/data_thchs30/dev/A11_101.wav'], ['8.26'], ['七十 年代 末 我 外出 求学 母亲 叮咛 我 吃饭 要 细嚼慢咽 学习 要 深 钻 细 研']]

def read_record(filename_queue, num_records):
    reader = tf.TextLineReader()
    key, value = reader.read_up_to(filename_queue, num_records)
    wav_filename, duration, transcript = tf.decode_csv(value, record_defaults, field_delim=",")
    wav_reader = tf.WholeFileReader()
    wav_key, wav_value = wav_reader.read_up_to(tf.train.string_input_producer(wav_filename, shuffle=False, capacity=num_records), num_records)
    return [wav_key, transcript] # throw errors
    # return [wav_key, wav_value] # works
    # return [wav_filename, duration, transcript] # works

data_queue = tf.train.string_input_producer(tf.train.match_filenames_once('dev.csv'), shuffle=False)
batch_data = [read_record(data_queue, batch_size) for _ in range(threads)]
capacity = threads * batch_size
batch_values = tf.train.batch_join(batch_data, batch_size=batch_size, capacity=capacity, enqueue_many=True)

init = tf.initialize_all_variables()
with tf.Session() as sess:
    sess.run(init)
    sess.run(tf.initialize_local_variables())
    coord = tf.train.Coordinator()
    print(coord)
    threads = tf.train.start_queue_runners(coord=coord)
    print("threads num: " + str(threads))
    try:
        step = 0
        while not coord.should_stop():
            step += 1
            feat = sess.run([batch_values])
            print("line:", step, feat)
    except tf.errors.OutOfRangeError:
        print(' training for 1 epochs, %d steps', step)
    finally:
        coord.request_stop()
        coord.join(threads)
It throws the errors below; how can I fix it?
The dev.csv content is as below:
/Users/phoenix/workspace/dataset/data_thchs30/dev/A11_101.wav,8.26,qi shi nian dai mo wo wai chu qiu xue
/Users/phoenix/workspace/dataset/data_thchs30/dev/A11_119.wav,6.9,chen yun tong shi yao qiu gan bu men ren zhen xue xi
I tried to rewrite your code like this; these are my observations.
The error is no longer thrown and the values are returned.
An obvious discrepancy is that the batch size for transcript is double the one specified, so it is 4 instead of 2; it doubles for some reason. There is no such problem for the audio binary.
The shapes=[tf.TensorShape(()), tf.TensorShape(batch_size,)] argument is based on an error I saw which mentioned that I have to specify this using TensorShape. I didn't find the documentation of much help, but it is mentioned there:
shapes: (Optional.) A list of fully-defined TensorShape objects with the same length as dtypes, or None.
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.DEBUG)

FLAGS = tf.app.flags.FLAGS
threads = 1
batch_size = 2

record_defaults = [['D:/male.wav'], ['8.26'], ['七十 年代 末 我 外出 求学 母亲 叮咛 我 吃饭 要 细嚼慢咽 学习 要 深 钻 细 研']]

def readbatch(data_queue):
    reader = tf.TextLineReader()
    _, rows = reader.read_up_to(data_queue, batch_size)
    wav_filename, duration, transcript = tf.decode_csv(rows, record_defaults, field_delim=",")
    audioreader = tf.WholeFileReader()
    _, audio = audioreader.read(tf.train.string_input_producer(wav_filename))
    return [audio, transcript]

data_queue = tf.train.string_input_producer(
    tf.train.match_filenames_once('D:/Book1.csv'), shuffle=False)
batch_data = [readbatch(data_queue) for _ in range(threads)]
capacity = threads * batch_size
batch_values = tf.train.batch_join(batch_data, shapes=[tf.TensorShape(()), tf.TensorShape(batch_size,)], capacity=capacity, batch_size=batch_size, enqueue_many=False)

init = tf.initialize_all_variables()
with tf.Session() as sess:
    sess.run(init)
    sess.run(tf.initialize_local_variables())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    try:
        step = 0
        while not coord.should_stop():
            step += 1
            feat = sess.run([batch_values])
            audio = feat[0][0]
            print ('Size of audio is ' + str(audio.size))
            script = feat[0][1]
            print ('Size of script is ' + str(script.size))
    except tf.errors.OutOfRangeError:
        print(' training for 1 epochs, %d steps', step)
    finally:
        coord.request_stop()
        coord.join(threads)
A sample dataset proves that there is an extra pair.
[[array([b'Text2', b'Text1'], dtype=object), array([[b'Translation-1', b'Translation-2'],
[b'Translation-1', b'Translation-2']], dtype=object)]]

ValueError: Tensor Tensor("Const:0", shape=(), dtype=float32) may not be fed with tf.placeholder

I'm trying to make a speech recognition system with TensorFlow.
The input data is a numpy array of size 50000 x 1.
The output (mapping) data is a numpy array of size 400 x 1.
Input and mapping data are passed in batches of 2 in a list.
I've used this tutorial to design the neural network. The following is the code snippet:
For RNN:
input_data = tf.placeholder(tf.float32, [batch_size, sound_constants.MAX_ROW_SIZE_IN_DATA, sound_constants.MAX_COLUMN_SIZE_IN_DATA], name="train_input")
target = tf.placeholder(tf.float32, [batch_size, sound_constants.MAX_ROW_SIZE_IN_TXT, sound_constants.MAX_COLUMN_SIZE_IN_TXT], name="train_output")
fwd_cell = tf.nn.rnn_cell.BasicLSTMCell(num_hidden, state_is_tuple=True, forget_bias=1.0)
# creating one backward cell
bkwd_cell = tf.nn.rnn_cell.BasicLSTMCell(num_hidden, state_is_tuple=True, forget_bias=1.0)
# creating bidirectional RNN
val, _, _ = tf.nn.static_bidirectional_rnn(fwd_cell, bkwd_cell, tf.unstack(input_data), dtype=tf.float32)
For feeding data:
feed = {g['input_data'] : trb[0], g['target'] : trb[1], g['dropout'] : 0.6}
accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
accuracy += accuracy_
When I ran the code, I got this error:
Traceback (most recent call last):
File "/home/wolborg/PycharmProjects/speech-to-text-rnn/src/rnn_train_1.py", line 205, in <module>
tr_losses, te_losses = train_network(g)
File "/home/wolborg/PycharmProjects/speech-to-text-rnn/src/rnn_train_1.py", line 177, in train_network
accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
File "/home/wolborg/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 895, in run
run_metadata_ptr)
File "/home/wolborg/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1102, in _run
raise ValueError('Tensor %s may not be fed.' % subfeed_t)
ValueError: Tensor Tensor("Const:0", shape=(), dtype=float32) may not be fed.
Process finished with exit code 1
Earlier I was facing this issue with tf.sparse_placeholder; after some browsing I changed the input type to tf.placeholder and made the related changes. Now I'm clueless about where I'm making the error.
Please suggest how I should feed the data.
Entire code:
import tensorflow as tf
# for taking MFCC and label input
import numpy as np
import rnn_input_data_1
import sound_constants

# input constants

# Training Parameters
num_input = 10 # mfcc data input
training_data_size = 8 # determines number of files in training and testing module
testing_data_size = num_input - training_data_size

# Network Parameters
learning_rate = 0.0001 # for large training set, it can be set 0.001
num_hidden = 200 # number of hidden layers
num_classes = 28 # total alphabet classes (a-z) + extra symbols (', ' ')
epoch = 1 # number of iterations
batch_size = 2 # number of batches

mfcc_coeffs, text_data = rnn_input_data_1.mfcc_and_text_encoding()

class DataGenerator:
    def __init__(self, data_size):
        self.ptr = 0
        self.epochs = 0
        self.data_size = data_size

    def next_batch(self):
        self.ptr += batch_size
        if self.ptr > self.data_size:
            self.epochs += 1
            self.ptr = 0
        return mfcc_coeffs[self.ptr-batch_size : self.ptr], text_data[self.ptr-batch_size : self.ptr]

def reset_graph():
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()

def struct_network():
    print ('Inside struct network !!')
    reset_graph()
    input_data = tf.placeholder(tf.float32, [batch_size, sound_constants.MAX_ROW_SIZE_IN_DATA, sound_constants.MAX_COLUMN_SIZE_IN_DATA], name="train_input")
    target = tf.placeholder(tf.float32, [batch_size, sound_constants.MAX_ROW_SIZE_IN_TXT, sound_constants.MAX_COLUMN_SIZE_IN_TXT], name="train_output")
    keep_prob = tf.constant(1.0)
    fwd_cell = tf.nn.rnn_cell.BasicLSTMCell(num_hidden, state_is_tuple=True, forget_bias=1.0)
    # creating one backward cell
    bkwd_cell = tf.nn.rnn_cell.BasicLSTMCell(num_hidden, state_is_tuple=True, forget_bias=1.0)
    # creating bidirectional RNN
    val, _, _ = tf.nn.static_bidirectional_rnn(fwd_cell, bkwd_cell, tf.unstack(input_data), dtype=tf.float32)
    # adding dropouts
    val = tf.nn.dropout(val, keep_prob)
    val = tf.transpose(val, [1, 0, 2])
    last = tf.gather(val, int(val.get_shape()[0]) - 1)
    print ('BiRNN created !!')
    print ('Last Size: ', last.get_shape())
    weight = tf.Variable(tf.truncated_normal([num_hidden * 2, sound_constants.MAX_ROW_SIZE_IN_TXT]))
    bias = tf.Variable(tf.constant(0.1, shape=[sound_constants.MAX_ROW_SIZE_IN_TXT]))
    # mapping to 28 output classes
    logits = tf.matmul(last, weight) + bias
    prediction = tf.nn.softmax(logits)
    prediction = tf.reshape(prediction, shape=[batch_size, sound_constants.MAX_ROW_SIZE_IN_TXT, sound_constants.MAX_COLUMN_SIZE_IN_TXT])
    # getting probability distribution
    mat1 = tf.cast(tf.argmax(prediction, 1), tf.float32)
    correct = tf.equal(prediction, target)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    logits = tf.reshape(logits, shape=[batch_size, sound_constants.MAX_ROW_SIZE_IN_TXT, sound_constants.MAX_COLUMN_SIZE_IN_TXT])
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=target))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)
    # returning components as dictionary elements
    return {'input_data': input_data,
            'target': target,
            'dropout': keep_prob,
            'loss': loss,
            'ts': train_step,
            'preds': prediction,
            'accuracy': accuracy
            }

def train_network(graph):
    # initialize tensorflow session and all variables
    # tf_gpu_config = tf.ConfigProto(allow_soft_placement = True, log_device_placement = True)
    # tf_gpu_config.gpu_options.allow_growth = True
    # with tf.Session(config = tf_gpu_config) as sess:
    with tf.Session() as sess:
        train_instance = DataGenerator(training_data_size)
        test_instance = DataGenerator(testing_data_size)
        print ('Training data size: ', train_instance.data_size)
        print ('Testing data size: ', test_instance.data_size)
        sess.run(tf.global_variables_initializer())
        print ('Starting session...')
        step, accuracy = 0, 0
        tr_losses, te_losses = [], []
        current_epoch = 0
        while current_epoch < epoch:
            step += 1
            trb = train_instance.next_batch()
            feed = {g['input_data']: trb[0], g['target']: trb[1], g['dropout']: 0.6}
            accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
            accuracy += accuracy_
            if train_instance.epochs > current_epoch:
                current_epoch += 1
                tr_losses.append(accuracy / step)
                step, accuracy = 0, 0
                # eval test set
                te_epoch = test_instance.epochs
                while test_instance.epochs == te_epoch:
                    step += 1
                    print ('Testing round ', step)
                    trc = test_instance.next_batch()
                    feed = {g['input_data']: trc[0], g['target']: trc[1]}
                    accuracy_ = sess.run([g['accuracy']], feed_dict=feed)[0]
                    accuracy += accuracy_
                te_losses.append(accuracy / step)
                step, accuracy = 0, 0
                print("Accuracy after epoch", current_epoch, " - tr:", tr_losses[-1], "- te:", te_losses[-1])
    return tr_losses, te_losses

g = struct_network()
tr_losses, te_losses = train_network(g)
You defined keep_prob as a tf.constant, but you are then trying to feed a value into it. Replace keep_prob = tf.constant(1.0) with keep_prob = tf.placeholder(tf.float32, []) or keep_prob = tf.placeholder_with_default(1.0, []).
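As a minimal, self-contained sketch (TF 1.x style, matching your code) of why the placeholder version can be fed while a constant cannot, and why placeholder_with_default is convenient for evaluation:
import tensorflow as tf

# A constant cannot be fed, but a placeholder_with_default can be fed or left at its default.
keep_prob = tf.placeholder_with_default(1.0, shape=[], name="keep_prob")
x = tf.constant([[1.0, 2.0, 3.0, 4.0]])
dropped = tf.nn.dropout(x, keep_prob)

with tf.Session() as sess:
    print(sess.run(dropped))                               # uses the default keep_prob = 1.0
    print(sess.run(dropped, feed_dict={keep_prob: 0.6}))   # feeding now works, unlike with tf.constant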

Plot in tensorboard is always closes and like a circle

I was trying to plot a loss curve, but it always comes out abnormal (it closes on itself, like a circle; I really don't know how to describe it properly in English). I have found many topics about problems like this but still can't solve it. My TensorFlow version is 0.10.0.
import tensorflow as tf
from tensorflow.core.util.event_pb2 import SessionLog
import os

# initialize variables/model parameters
# define the training loop operations

def inputs():
    # read/generate input training data X and expected outputs Y
    weight_age = [[84,46],[73,20],[65,52],[70,30],[76,57],[69,25],[63,28],[72,36],[79,57],[75,44],[27,24]
                  ,[89,31],[65,52],[57,23],[59,60],[69,48],[60,34],[79,51],[75,50],[82,34],[59,46],[67,23],
                  [85,37],[55,40],[63,30]]
    blodd_fat_content = [354,190,405,263,451,302,288,385,402,365,209,290,346,
                         254,395,434,220,374,308,220,311,181,274,303,244]
    return tf.to_float(weight_age), tf.to_float(blodd_fat_content)

def inference(X):
    # compute inference model over data X and return the result
    return tf.matmul(X, W) + b

def loss(X, Y):
    # compute loss over training data X and expected outputs Y
    Y_predicted = inference(X)
    return tf.reduce_sum(tf.squared_difference(Y, Y_predicted))

def train(total_loss):
    # train / adjust model parameters according to computed total loss
    learning_rate = 1e-7
    return tf.train.GradientDescentOptimizer(learning_rate).minimize(total_loss)

def evaluate(sess, X, Y):
    # evaluate the resulting trained model
    print (sess.run(inference([[80., 25.]])))
    print (sess.run(inference([[60., 25.]])))

g1 = tf.Graph()
with tf.Session(graph=g1) as sess:
    W = tf.Variable(tf.zeros([2,1]), name="weights")
    b = tf.Variable(0., name="bias")
    tf.initialize_all_variables().run()
    X, Y = inputs()
    print (sess.run(W))
    total_loss = loss(X, Y)
    train_op = train(total_loss)
    tf.scalar_summary("loss", total_loss)
    summaries = tf.merge_all_summaries()
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    summary_writer = tf.train.SummaryWriter('linear', g1)
    summary_writer.add_session_log(session_log=SessionLog(status=SessionLog.START), global_step=1)
    # actual training loop
    training_steps = 100
    tolerance = 100
    total_loss_last = 0
    initial_step = 0
    # Create a saver.
    saver = tf.train.Saver()
    # verify if we don't have a checkpoint saved already
    ckpt = tf.train.get_checkpoint_state(os.path.dirname('my_model'))
    if ckpt and ckpt.model_checkpoint_path:
        # Restores from checkpoint
        saver.restore(sess, ckpt.model_checkpoint_path)
        initial_step = int(ckpt.model_checkpoint_path.rsplit('-', 1)[1])
        # summary_writer.add_session_log(SessionLog(status=SessionLog.START), global_step=initial_step)
    for step in range(initial_step, training_steps):
        sess.run([train_op])
        if step % 20 == 0:
            saver.save(sess, 'my-model', global_step=step)
        gap = abs(sess.run(total_loss) - total_loss_last)
        total_loss_last = sess.run(total_loss)
        summary_writer.add_summary(sess.run(summaries), step)
        # for debugging and learning purposes, see how the loss gets decremented thru training steps
        if step % 10 == 0:
            print ("loss: ", sess.run([total_loss]))
            print("step: ", step)
        if gap < tolerance:
            break
    # evaluation...
    evaluate(sess, X, Y)
    coord.request_stop()
    coord.join(threads)
    saver.save(sess, 'my-model', global_step=training_steps)
    summary_writer.flush()
    sess.close()

Feature selection in scikit learn for multiple variables and thousands+ features

I am trying to perform feature selection for a logistic regression classifier. Originally there are 4 variables: name, location, gender, and the label, ethnicity. The string variables, the name in particular, give rise to tens of thousands of additional "features": for example, the name "John Snow" yields 2-letter substrings like 'jo', 'oh', 'hn', and so on. The feature set then undergoes DictVectorization.
I am trying to follow this tutorial (http://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html), but I am not sure I am doing it right, since the tutorial uses a small number of features while mine has tens of thousands after vectorization. Also, plt.show() displays a blank figure.
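To make the setup concrete, here is a minimal sketch of how such substring features can be turned into a sparse matrix with DictVectorizer; the name_to_substring_dict helper is hypothetical (the actual substring generation in the pipeline may differ), and the full question code follows below:
from sklearn.feature_extraction import DictVectorizer

def name_to_substring_dict(name, n=2):
    # "John Snow" -> {'substring=jo': True, 'substring=oh': True, 'substring=hn': True, ...}
    name = name.lower().replace(' ', '')
    return {'substring=' + name[i:i+n]: True for i in range(len(name) - n + 1)}

dv = DictVectorizer()
X = dv.fit_transform([name_to_substring_dict("John Snow"),
                      name_to_substring_dict("Jane Doe")])
print(dv.get_feature_names())  # one column per distinct substring
print(X.toarray())             # dense view of the sparse feature matrix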
# coding=utf-8
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import re
import random
import time
from random import randint
import csv
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

# Assign X and y variables
X = df.raw_name.values
X2 = df.name.values
X3 = df.gender.values
X4 = df.location.values
y = df.ethnicity_scan.values

# Feature extraction functions
def feature_full_name(nameString):
    try:
        full_name = nameString
        if len(full_name) > 1: # not accept name with only 1 character
            return full_name
        else: return '?'
    except: return '?'

def feature_avg_wordLength(nameString):
    try:
        space = 0
        for i in nameString:
            if i == ' ':
                space += 1
        length = float(len(nameString) - space)
        name_entity = float(space + 1)
        avg = round(float(length/name_entity), 0)
        return avg
    except:
        return 0

def feature_name_entity(nameString2):
    space = 0
    try:
        for i in nameString2:
            if i == ' ':
                space += 1
        return space+1
    except: return 0

def feature_gender(genString):
    try:
        gender = genString
        if len(gender) >= 1:
            return gender
        else: return '?'
    except: return '?'

def feature_noNeighborLoc(locString):
    try:
        x = re.sub(r'^[^, ]*', '', locString) # remove everything before and include first ','
        y = x[2:] # remove subsequent ',' and ' '
        return y
    except: return '?'

def list_to_dict(substring_list):
    try:
        substring_dict = {}
        for i in substring_list:
            substring_dict['substring='+str(i)] = True
        return substring_dict
    except: return '?'

# Transform format of X variables, and spit out a numpy array for all features
my_dict13 = [{'name-entity': feature_name_entity(feature_full_name(i))} for i in X2]
my_dict14 = [{'avg-length': feature_avg_wordLength(feature_full_name(i))} for i in X]
my_dict15 = [{'gender': feature_full_name(i)} for i in X3]
my_dict16 = [{'location': feature_noNeighborLoc(feature_full_name(i))} for i in X4]
my_dict17 = [{'dummy1': 1} for i in X]
my_dict18 = [{'dummy2': random.randint(0,2)} for i in X]

all_dict = []
for i in range(0, len(my_dict)):
    temp_dict = dict(my_dict13[i].items() + my_dict14[i].items()
                     + my_dict15[i].items() + my_dict16[i].items() + my_dict17[i].items() + my_dict18[i].items()
                     )
    all_dict.append(temp_dict)
newX = dv.fit_transform(all_dict)

# Separate the training and testing data sets
half_cut = int(len(df)/2.0)*-1
X_train = newX[:half_cut]
X_test = newX[half_cut:]
y_train = y[:half_cut]
y_test = y[half_cut:]

# Fitting X and y into model, using training data
lr = LogisticRegression()
lr.fit(X_train, y_train)
dv = DictVectorizer()

# Feature selection
plt.figure(1)
plt.clf()
X_indices = np.arange(X_train.shape[-1])
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(X_train, y_train)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices - .45, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)', color='g')
plt.show()
Warning:
E:\Program Files Extra\Python27\lib\site-packages\sklearn\feature_selection\univariate_selection.py:111: UserWarning: Features [[0 0 0 ..., 0 0 0]] are constant.
It looks like the way you split your data into training and testing sets is not working:
# Separate the training and testing data sets
X_train = newX[:half_cut]
X_test = newX[half_cut:]
If you are already using sklearn, it is much more convenient to use its built-in splitting routine for this:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5, random_state=0)
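For completeness, a minimal sketch of that call with the import it needs; the cross_validation module matches the older sklearn API used elsewhere in the question, while newer versions expose train_test_split from sklearn.model_selection:
from sklearn import cross_validation  # sklearn.model_selection.train_test_split in newer versions

# newX is the DictVectorizer output and y the label array from the code above
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    newX, y, test_size=0.5, random_state=0)

print(X_train.shape, X_test.shape)  # roughly a 50/50 split of the rows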