Pyomo error when passing rule from objective - pyomo

I am running the following pyomo code
C = list(datadict.keys())
model = ConcreteModel()
model.IDX = range(23)
model.zIDX = range(1)
def _initialize_rule(model, i):
return datadict[C[i]]['Int']
def _bounds_rule(model, i):
return (datadict[C[i]]['Min'], datadict[C[i]]['Max'])
# declare decision variables
model.x = Var(model.IDX, initialize=_initialize_rule, domain=NonNegativeReals, bounds=_bounds_rule)
model.z = Var(model.zIDX, initialize=1, domain=NonNegativeReals, bounds=(0, None))
model.c1 = Constraint(
expr = sum(model.x[i]*datadict[C[i]]["A"] for i in model.IDX) == budget
)
def _maa_rule(m):
v = BlockVector(23)
for i in m.IDX:
if channelData[C[i]]["A"]==0:
v.set_block(i, 1)
else:
v.set_block(i, m.x[i])
inputArr = v.flatten()
# Month
inputArr = np.append(inputArr, [dateData['Month']])
# DOW
inputArr = np.append(inputArr, [dateData['DOW']])
# DOY
inputArr = np.append(inputArr, [dateData['DOY']])
# Quarter
inputArr = np.append(inputArr, [dateData['Quarter']])
# fracDOY
inputArr = np.append(inputArr, [dateData['fracDOY']])
X_arr = X_scaler.transform(inputArr)
I_arr = Y_Inflow_scaler.inverse_transform(automlInflow.predict(X_arr).reshape(-1, 1))
O_arr = Y_Outflow_scaler.inverse_transform(automlOutflow.predict(X_arr).reshape(-1, 1))
m.z[i] = I_arr[0][0] - O_arr[0][0]
return m.z[i]
model.obj = Objective(rule=_maa_rule, sense=maximize)
But I get the following error when I try to run the code...
ERROR: Rule failed when generating expression for Objective obj with index
None: AssertionError: Blocks need to be numpy arrays or BlockVectors
ERROR: Constructing component 'obj' from data=None failed: AssertionError:
Blocks need to be numpy arrays or BlockVectors
---------------------------------------------------------------------------
AssertionError: Blocks need to be numpy arrays or BlockVectors
I have tried creating a blockVector and then assigning the predict values to it, but nothing seems to work.
Any help would be GREATLY appreciated.

Related

boost.python : pandas dataframe to c++

I want to use boost.python to use multi-index columns dataframe in c++.
※multi-index columns dataframe is like
I changed the type of multi-index columns dataframe into csv.
My csv file looks like this on spreadsheet
The reason why I want to use this data is for backtest. This is my backtest code in python that I want to translate to c++.
import pandas as pd
import numpy as np
from utils import load_data, load_list_csv, to_int
class No_Strategy():
def __init__(self, codes, unit, cash, position):
self.codes = codes
self.unit = unit
self.cash = cash
self.buy_signal = [0]*len(codes)
self.sell_signal = [0]*len(codes)
self.valid = 0
self.position = position
self.pass_st = 0 # 전략에 들어가지도 못한 경우
def set_data(self, prev_fs_row, fs_row, indi_row):
self.prev_fs = prev_fs_row
self.fs = fs_row # multi dimensional df
self.indi = indi_row
def _strat(self, prev_fs, curr_fs, curr_indi):
curr_rev = prev_rev = curr_ni = prev_ni = ni_growth = curr_asset = noncurr_asset = curr_asset_rat = 0
try:
prev_rev = int(prev_fs['매출액'].replace(",",""))
curr_rev = int(curr_fs['매출액'].replace(",",""))
except:
self.pass_st += 1
return 0, 0
rev_growth=(curr_rev-prev_rev)/prev_rev
try:
prev_ni = int(prev_fs['당기순이익'].replace(",",""))
curr_ni = int(curr_fs['당기순이익'].replace(",",""))
except:
self.pass_st += 1
return 0, 0
ni_growth=(curr_ni-prev_ni)/prev_ni
try:
curr_asset = int(curr_fs['유동자산'].replace(",",""))
noncurr_asset = int(curr_fs['비유동자산'].replace(",",""))
except:
self.pass_st += 1
return 0, 0
curr_asset_rat = curr_asset / noncurr_asset
#### this is the buy strategy! You can change the below ####
if (curr_indi.golden_cross) or (curr_indi.rsi_k < 0.65) :
return 1, 0
#### ************************************************** ####
#### this is the sell strategy! You can change the below ####
if (curr_indi.dead_cross):
return 0, 1
#### ************************************************** ####
return 0, 0
def run(self):
for i, code in enumerate(self.codes):
self.valid = 0
prev_fs = self.prev_fs[code]
curr_fs = self.fs[code]
curr_indi = self.indi[code]
prev_fs_cell = None
curr_fs_cell = None
try:
prev_fs_cell = prev_fs.iloc[0].replace(",","")
try:
curr_fs_cell = curr_fs.iloc[0].replace(",","")
except:
self.pass_st += 1
pass
except:
self.pass_st += 1
pass
if (curr_fs_cell != None) & (prev_fs_cell != None):
self.valid = 1
buy, sell = self._strat(prev_fs, curr_fs, curr_indi)
if self.valid == 0:
self.pass_st += 1
continue
else: # buy or sell signal get
price = curr_indi['close']
if buy:
if self.cash >= self.unit * price:
self.buy_signal[i] = self.unit
self.position[i] += self.unit
self.cash -= price * self.unit
elif sell:
if self.position[i] > 0 :
sell_num = self.position[i] - int(self.position[i]/2)
self.sell_signal[i] = sell_num
self.position[i] = int(self.position[i]/2) # 1-> 1 sell, 4 -> 2 sell ....
self.cash += price * sell_num
##title
class Broker():
def __init__(self, codes):
self.cash = 200000000 #2억
self.cash_df = None #pd.DataFrame(columns=['cash'])
self.position = [0]*len(codes)
self.position_df = None #pd.DataFrame(columns=codes) # for accumulated profit calculation
self.buy_signal = None #pd.DataFrame(columns=codes) # codes = KOSPI_stock_names
self.sell_signal = None #pd.DataFrame(columns=codes)
self.codes = codes # 012934, 3281, ...
self.unit = 1 # 주식 매매 단위
self.pass_st = 0
def set_strat(self, strategy):
self.strategy = strategy # class
def set_time(self, time_index): # time_index type: pd.Index / time range for indi df
self.buy_signal = pd.DataFrame(columns = self.codes, index = time_index) #set_index(time_index)
self.sell_signal = pd.DataFrame(columns = self.codes, index = time_index) #.set_index(time_index)
self.position_df = pd.DataFrame(columns = self.codes, index = time_index)
self.cash_df = pd.DataFrame(columns = ['cash'], index = time_index)#.set_index(time_index)
self.time_index = time_index
def set_data(self, fs, indi, price):
self.fs = fs # multi dimensional df / start: 0th - nth
self.indi = indi # multi dimensional df / start : 1th - nth
self.price = price # 2 dimensional (date X codes : close price)
def update_data(self, strategy, date):
self.cash = strategy.cash
self.cash_df.loc[date] = strategy.cash
self.position = strategy.position
self.position_df.loc[date] = strategy.position #list
self.buy_signal.loc[date] = strategy.buy_signal #list
self.sell_signal.loc[date] = strategy.sell_signal #list
self.pass_st += strategy.pass_st
def run(self):
for date in self.time_index: #아마 수정해야 할 확률 높음
if date.year == 2021:
break
else:
prev_fs_row = self.fs.loc[date.year-1] # ex: 2014
fs_row = self.fs.loc[date.year] # 2015
indi_row = self.indi.loc[date] # 2015
strategy = self.strategy(self.codes, self.unit, self.cash, self.position)
strategy.set_data(prev_fs_row, fs_row, indi_row)
strategy.run()
self.update_data(strategy, date)
def performance(self):
# !!!! 2020년까지의 결과만 성능 평가 ####
cash_df = self.cash_df[self.cash_df.index < '2021']
position_df = self.position_df[self.position_df.index < '2021']
price = self.price[self.price.index < '2021']
buy_signal = self.buy_signal[self.buy_signal.index < '2021']
sell_signal = self.sell_signal[self.sell_signal.index < '2021']
last_price = price.iloc[-1]
total_remain_num = self.position # last(2020) position data
total_buy = (price * buy_signal).sum(axis=1).sum()
total_sell = (price * sell_signal).sum(axis=1).sum()
total_remain = (last_price * total_remain_num).sum()
print(f'remain 개수: {total_remain_num}, total_remain: {total_remain} total_buy: {total_buy}, total_sell={total_sell}')
profit = total_sell + total_remain - total_buy
try:
return_mean = profit / total_buy
except:
print("no buy")
return
accum_df = (cash_df['cash'] + ((price.fillna(0) * position_df).sum(axis=1))).to_frame() # row sum
daily_return_df = (accum_df - accum_df.shift(1))/accum_df.shift(1)-1
SSE = ((daily_return_df - return_mean)**2).sum().item()
std = np.sqrt(SSE/(accum_df.shape[0]-1)) # route(sigma(x-x_bar)^2 / (n-1))
sharp = return_mean / std
self.return_mean = return_mean
self.sharp = sharp
print(f'return_mean: {return_mean}, sharp: {sharp}')
code_path = GDRIVE_DATA_PATH + 'codes.csv'
fs_path = GDRIVE_DATA_PATH + 'fs_total.csv'
indi_path = GDRIVE_DATA_PATH + 'indi_total.csv'
price_path = GDRIVE_DATA_PATH + 'prices.csv'
fs_total = load_data("fs_total.csv")
indi_total = load_data("indi_total.csv") # stock price and indicator(Golden cross, RSI, etc.)
prices = load_data("prices.csv") # stock close price data rows:date, cols: stock code.
time_index = indi_total.index # time index of indi_total multi-index columns
broker = Broker(codes)
broker.set_strat(No_Strategy)
broker.set_time(time_index)
broker.set_data(fs_total, indi_total, prices)
broker.run()
broker.performance()
I want to translate it not changing much in flow of the code.
But I cannot find how to get multi-index columns dataframe in c++, and transfer its row data to No_Strategy to decide whether invest into the stock.
※ I uploaded similar question before and get thankful answer, but it is too complicated for me so I question one more time with detail information.
look at https://github.com/hosseinmoein/DataFrame. It has about 95% of Pandas functionality in a much faster framework

PYOMO: How to use abstract models with internal data

Hei all,
I am trying to set up an abstract model for a very simple QP of the form
min (x-x0)^2
s.t.
A x = b
C x <= d
I would like to use an abstract model, as I need to resolve with changing parameters (mainly x0, but potentially also A, b, C, d). I am right now struggeling with simply setting the parameters in the model instance. I do not want to use an external data file, but rather internal python variables. All examples I find online use AMPL formatted data files.
This is the code I have right now
import pyomo.environ as pe
model = pe.AbstractModel()
# the sets
model.n = pe.Param(within=pe.NonNegativeIntegers)
model.m = pe.Param(initialize = 1)
model.ss = pe.RangeSet(1, model.n)
model.os = pe.RangeSet(1, model.m)
# the starting point and the constraint parameters
model.x_hat = pe.Param(model.ss)
model.A = pe.Param(model.os, model.ss)
model.b = pe.Param(model.os)
model.C = pe.Param(model.os, model.os)
model.d = pe.Param(model.ss, model.os)
# the decision variables
model.x_projected = pe.Var(model.ss)
# the cosntraints
# A x = b
def sum_of_elements_rule(model):
value = model.A * model.x_projected
return value == model.d
model.sumelem = pe.Constraint(model.os, rule=sum_of_elements_rule)
# C x <= d
def positivity_constraint(model):
return model.C*model.x_projected <= model.d
model.bounds = pe.Constraint(model.ss, rule=positivity_constraint)
# the cost
def cost_rule(model):
return sum((model.x_projected[i] - model.x[i])**2 for i in model.ss)
model.cost = pe.Objective(rule=cost_rule)
instance = model.create_instance()
And somehow here I am stuck. How do I set the parameters now?
Thanks and best, Theo
I know this is an old post but a solution to this could have helped me so here is the solution to this problem:
## TEST
data_init= {None: dict(
n = {None : 3},
d = {0:0, 1:1, 2:2},
x_hat = {0:10, 1:-1, 2:-100},
b = {None: 10}
)}
# create instance
instance = model.create_instance(data_init)
This creates the instance in an equivalent way than what you did but in a more formal way.
Ok, I seemed to have figured out what the problem is. If I want to set a parameter after I create an instance, I need the
mutable=True
flag. Then, I can set the parameter with something like
for i in range(model_dimension):
getattr(instance, 'd')[i] = i
The model dimension I need to choose before i create an instance (which is ok for my case). The instance can be reused with different parameters for the constraints.
The code below should work for the problem
min (x-x_hat)' * (x-x_hat)
s.t.
sum(x) = b
x[i] >= d[i]
with x_hat, b, d as parameters.
import pyomo.environ as pe
model = pe.AbstractModel()
# model dimension
model.n = pe.Param(default=2)
# state space set
model.ss = pe.RangeSet(0, model.n-1)
# equality
model.b = pe.Param(default=5, mutable=True)
# inequality
model.d = pe.Param(model.ss, default=0.0, mutable=True)
# decision var
model.x = pe.Var(model.ss)
model.x_hat = pe.Param(model.ss, default=0.0, mutable=True)
# the cost
def cost_rule(model):
return sum((model.x[i] - model.x_hat[i])**2 for i in model.ss)
model.cost = pe.Objective(rule=cost_rule)
# CONSTRAINTS
# each x_i bigger than d_i
def lb_rule(model, i):
return (model.x[i] >= model.d[i])
model.state_bound = pe.Constraint(model.ss, rule=lb_rule)
# sum of x == P_tot
def sum_rule(model):
return (sum(model.x[i] for i in model.ss) == model.b)
model.state_sum = pe.Constraint(rule=sum_rule)
## TEST
# define model dimension
model_dimension = 3
model.n = model_dimension
# create instance
instance = model.create_instance()
# set d
for i in range(model_dimension):
getattr(instance, 'd')[i] = i
# set x_hat
xh = (10,1,-100)
for i in range(model_dimension):
getattr(instance, 'x_hat')[i] = xh[i]
# set b
instance.b = 10
# solve
solver = pe.SolverFactory('ipopt')
result = solver.solve(instance)
instance.display()

Reformulating the AMPL car example

I am trying migrating the ampl car problem that comes in the Ipopt source code tarball as example. I am having got problems with the end condition (reach a place with zero speed at final iteration) and with the cost function (minimize final time).
Can someone help me revise the following model?
# min tf
# dx/dt = 0
# dv/dt = a - R*v^2
# x(0) = 0; x(tf) = 100
# v(0) = 0; v(tf) = 0
# -3 <= a <= 1 (a is the control variable)
#!Python3.5
from pyomo.environ import *
from pyomo.dae import *
N = 20;
T = 10;
L = 100;
m = ConcreteModel()
# Parameters
m.R = Param(initialize=0.001)
# Variables
def x_init(m, i):
return i*L/N
m.t = ContinuousSet(bounds=(0,1000))
m.x = Var(m.t, bounds=(0,None), initialize=x_init)
m.v = Var(m.t, bounds=(0,None), initialize=L/T)
m.a = Var(m.t, bounds=(-3.0,1.0), initialize=0)
# Derivatives
m.dxdt = DerivativeVar(m.x, wrt=m.t)
m.dvdt = DerivativeVar(m.v, wrt=m.t)
# Objetives
m.obj = Objective(expr=m.t[N])
# DAE
def _ode1(m, i):
if i==0:
return Constraint.Skip
return m.dxdt[i] == m.v[i]
m.ode1 = Constraint(m.t, rule=_ode1)
def _ode2(m, i):
if i==0:
return Constraint.Skip
return m.dvdt[i] == m.a[i] - m.R*m.v[i]**2
m.ode2 = Constraint(m.t, rule=_ode2)
# Constraints
def _init(m):
yield m.x[0] == 0
yield m.v[0] == 0
yield ConstraintList.End
m.init = ConstraintList(rule=_init)
'''
def _end(m, i):
if i==N:
return m.x[i] == L amd m.v[i] == 0
return Constraint.Skip
m.end = ConstraintList(rule=_end)
'''
# Discretize
discretizer = TransformationFactory('dae.finite_difference')
discretizer.apply_to(m, nfe=N, wrt=m.t, scheme='BACKWARD')
# Solve
solver = SolverFactory('ipopt', executable='C:\\EXTERNOS\\COIN-OR\\win32-msvc12\\bin\\ipopt')
results = solver.solve(m, tee=True)
Currently, a ContinuousSet in Pyomo has to be bounded. This means that in order to solve a minimum time optimal control problem using this tool, the problem must be reformulated to remove the time scaling from the ContinuousSet. In addition, you have to introduce an extra variable to represent the final time. I've added an example to the Pyomo github repository showing how this can be done for your problem.

Pyomo: Extending the "car ampl example" with additional constraints

After having seen the nice implementation of the "ampl car example" in Pyomo repository, I would like to keep extending the problem with new features and constraints, but I have found the next problems during development. Is someone able of fix them?
1) Added new constraint "electric car": Now the acceleration is limited by adherence until a determined speed and then constant power model is used. I am not able of implement this constraint as i would think. It is commented in the, but Pyomo complains about that a constraint is related to a variable. (now Umax depends of the car speed).
2) Added new comfort acceleration and jerk constraints. It seems they are working right, but should be nice if a Pyomo guru supervise them and tell me if they are really implemented in the correct way.
3) About last one, in order of reducing verbosity. Is there any way of combine accelerationL and accelerationU in a unique constraint? Same for jerkL and jerkU.
4) The last feature is a speed limit constraint divided in two steps. Again, I am not able of getting it works, so it is commented in code. Does anybody dare to fix it?
# Ampl Car Example (Extended)
#
# Shows how to convert a minimize final time optimal control problem
# to a format pyomo.dae can handle by removing the time scaling from
# the ContinuousSet.
#
# min tf
# dx/dt = v
# dv/dt = u - R*v^2
# x(0)=0; x(tf)=L
# v(0)=0; v(tf)=0
# -3 <= u <= 1 (engine constraint)
#
# {v <= 7m/s ===> u < 1
# u <= { (electric car constraint)
# {v > 7m/s ===> u < 1*7/v
#
# -1.5 <= dv/dt <= 0.8 (comfort constraint -> smooth driving)
# -0.5 <= d2v/dt2 <= 0.5 (comfort constraint -> jerk)
# v <= Vmax (40 kmh[0-500m] + 25 kmh(500-1000m])
from pyomo.environ import *
from pyomo.dae import *
m = ConcreteModel()
m.R = Param(initialize=0.001) # Friction factor
m.L = Param(initialize=1000.0) # Final position
m.T = Param(initialize=50.0) # Estimated time
m.aU = Param(initialize=0.8) # Acceleration upper bound
m.aL = Param(initialize=-1.5) # Acceleration lower bound
m.jU = Param(initialize=0.5) # Jerk upper bound
m.jL = Param(initialize=-0.5) # Jerk lower bound
m.NFE = Param(initialize=100) # Number of finite elements
'''
def _initX(m, i):
return m.x[i] == i*m.L/m.NFE
def _initV(m):
return m.v[i] == m.L/50
'''
m.tf = Var()
m.tau = ContinuousSet(bounds=(0,1)) # Unscaled time
m.t = Var(m.tau) # Scaled time
m.x = Var(m.tau, bounds=(0,m.L))
m.v = Var(m.tau, bounds=(0,None))
m.u = Var(m.tau, bounds=(-3,1), initialize=0)
m.dt = DerivativeVar(m.t)
m.dx = DerivativeVar(m.x)
m.dv = DerivativeVar(m.v)
m.da = DerivativeVar(m.v, wrt=(m.tau, m.tau))
m.obj = Objective(expr=m.tf)
def _ode1(m, i):
if i==0:
return Constraint.Skip
return m.dt[i] == m.tf
m.ode1 = Constraint(m.tau, rule=_ode1)
def _ode2(m, i):
if i==0:
return Constraint.Skip
return m.dx[i] == m.tf * m.v[i]
m.ode2 = Constraint(m.tau, rule=_ode2)
def _ode3(m, i):
if i==0:
return Constraint.Skip
return m.dv[i] == m.tf*(m.u[i] - m.R*m.v[i]**2)
m.ode3 = Constraint(m.tau, rule=_ode3)
def _accelerationL(m, i):
if i==0:
return Constraint.Skip
return m.dv[i] >= m.aL*m.tf
m.accelerationL = Constraint(m.tau, rule=_accelerationL)
def _accelerationU(m, i):
if i==0:
return Constraint.Skip
return m.dv[i] <= m.aU*m.tf
m.accelerationU = Constraint(m.tau, rule=_accelerationU)
def _jerkL(m, i):
if i==0:
return Constraint.Skip
return m.da[i] >= m.jL*m.tf**2
m.jerkL = Constraint(m.tau, rule=_jerkL)
def _jerkU(m, i):
if i==0:
return Constraint.Skip
return m.da[i] <= m.jU*m.tf**2
m.jerkU = Constraint(m.tau, rule=_jerkU)
'''
def _electric(m, i):
if i==0:
return Constraint.Skip
elif value(m.v[i])<=7:
return m.a[i] <= 1
else:
return m.v[i] <= 1*7/m.v[i]
m.electric = Constraint(m.tau, rule=_electric)
'''
'''
def _speed(m, i):
if i==0:
return Constraint.Skip
elif value(m.x[i])<=500:
return m.v[i] <= 40/3.6
else:
return m.v[i] <= 25/3.6
m.speed = Constraint(m.tau, rule=_speed)
'''
def _initial(m):
yield m.x[0] == 0
yield m.x[1] == m.L
yield m.v[0] == 0
yield m.v[1] == 0
yield m.t[0] == 0
m.initial = ConstraintList(rule=_initial)
discretizer = TransformationFactory('dae.finite_difference')
discretizer.apply_to(m, nfe=value(m.NFE), wrt=m.tau, scheme='BACKWARD')
#discretizer = TransformationFactory('dae.collocation')
#discretizer.apply_to(m, nfe=value(m.NFE), ncp=4, wrt=m.tau, scheme='LAGRANGE-RADAU')
solver = SolverFactory('ipopt')
solver.solve(m,tee=True)
print("final time = %6.2f" %(value(m.tf)))
t = []
x = []
v = []
a = []
u = []
for i in m.tau:
t.append(value(m.t[i]))
x.append(value(m.x[i]))
v.append(3.6*value(m.v[i]))
a.append(10*value(m.u[i] - m.R*m.v[i]**2))
u.append(10*value(m.u[i]))
import matplotlib.pyplot as plt
plt.plot(x, v, label='v (km/h)')
plt.plot(x, a, label='a (dm/s2)')
plt.plot(x, u, label='u (dm/s2)')
plt.xlabel('distance')
plt.grid('on')
plt.legend()
plt.show()
Thanks a lot in advance,
Pablo
(1) You should not think of Pyomo constraint rules as callbacks that are used by the solver. You should think of them more as a function to generate a container of constraint objects that gets called once for each index when the model is constructed. Meaning it is invalid to use a variable in an if statement unless you are really only using its initial value to define the constraint expression. There are ways to express what I think you are trying to do, but they involve introducing binary variables into the problem, in which case you can no longer use Ipopt.
(2) Can't really provide any help. Syntax looks fine.
(3) Pyomo allows you to return double-sided inequality expressions (e.g., L <= f(x) <= U) from constraint rules, but they can not involve variable expressions in the L and U locations. It doesn't look like the constraints you are referring to can be combined into this form.
(4) See (1)

How to predict a single new sample after dict-vectorization in python scikit-learn?

I am using logistic regression classifier to predict ethnicity class label 0, 1. My data is split into testing and training sample and got dict-vectorized into sparse matrix.
The following is the working codes, where I predict and validate X_train and X_test which are part of the features that got vectorized:
for i in mass[k]:
df = df_temp # reset df before each loop
#$$
if 1==1:
count+=1
ethnicity_tar = str(i)
############################################
############################################
def ethnicity_target(row):
try:
if row[ethnicity_var] == ethnicity_tar:
return 1
else:
return 0
except: return None
df['ethnicity_scan'] = df.apply(ethnicity_target, axis=1)
print '1=', ethnicity_tar
print '0=', 'non-'+ethnicity_tar
# Random sampling a smaller dataframe for debugging
rows = df.sample(n=subsample_size, random_state=seed) # Seed gives fixed randomness
df = DataFrame(rows)
print 'Class count:'
print df['ethnicity_scan'].value_counts()
# Assign X and y variables
X = df.raw_name.values
X2 = df.name.values
X3 = df.gender.values
X4 = df.location.values
y = df.ethnicity_scan.values
# Feature extraction functions
def feature_full_name(nameString):
try:
full_name = nameString
if len(full_name) > 1: # not accept name with only 1 character
return full_name
else: return '?'
except: return '?'
def feature_full_last_name(nameString):
try:
last_name = nameString.rsplit(None, 1)[-1]
if len(last_name) > 1: # not accept name with only 1 character
return last_name
else: return '?'
except: return '?'
def feature_full_first_name(nameString):
try:
first_name = nameString.rsplit(' ', 1)[0]
if len(first_name) > 1: # not accept name with only 1 character
return first_name
else: return '?'
except: return '?'
# Transform format of X variables, and spit out a numpy array for all features
my_dict = [{'last-name': feature_full_last_name(i)} for i in X]
my_dict5 = [{'first-name': feature_full_first_name(i)} for i in X]
all_dict = []
for i in range(0, len(my_dict)):
temp_dict = dict(
my_dict[i].items() + my_dict5[i].items()
)
all_dict.append(temp_dict)
newX = dv.fit_transform(all_dict)
# Separate the training and testing data sets
X_train, X_test, y_train, y_test = cross_validation.train_test_split(newX, y, test_size=testTrainSplit)
# Fitting X and y into model, using training data
classifierUsed2.fit(X_train, y_train)
# Making predictions using trained data
y_train_predictions = classifierUsed2.predict(X_train)
y_test_predictions = classifierUsed2.predict(X_test)
However, I would like to predict just a single name for example "John Carter" and predict the ethnicity label. I replaced the y_train_predictions = classifierUsed2.predict(X_train) and y_train_predictions = classifierUsed2.predict(X_train) with the following line but resulting in error:
print classifierUsed2.predict(["John Carter"])
#error
Error: X has 1 features per sample; expecting 103916
You need to transform your data in the exact same way as a training one, thus something like (if your input data was just list of strings)
classifierUsed2.predict(dv.transform(["John Carter"]))