Pyomo: What data formats can I pass into .create_instance()?

I want to import parameters from one Excel sheet (and, in the future, also from one CSV file), plus some parameters that I set directly in the code.
I am importing these values using pandas, but then I don't know how to pass them to the instance. I have tried various options, but I am only guessing...
I have seen several examples, but I am not able to understand and adapt them.
import pandas as pd
from pyomo.environ import *
from pyomo.opt import SolverFactory
from pyomo.core import Var
infinity = float('inf')
opt = SolverFactory('glpk')  # GNU Linear Programming Kit for large-scale linear programming (LP), mixed integer programming (MIP), and related problems
df1 = pd.read_excel("datosPvaD.xlsx")
df2 = pd.read_excel("otrosDatos.xlsx")
#demand = consumption['Consumo (Wh)']
#demand.index += 1
#demand_list = demand.tolist()
data1 = df1.to_dict()
#data2 = df2.to_dict(orient='index')
#data2 = df2.to_dict()
"""
# is the same as otros datos
data2 = {None: {
'pRdImp': {None: 0.35},
'pRdExp': {None: 0.1},
'rend': {None: 0.9},
'CAB': {None: 0.082},
'CABasic': {None: 0.082},
'CAPV': {None: 0.224},
'CI': {None: 0.06849},
'M': {None: 1000},
'dt': {None: 1},
}}
"""
data2 = {'pRdImp': 0.35,
'pRdExp': 0.1,
'rend': 0.9,
'CAB': 0.08,
'CABasic': 0.082,
'CAPV': 0.224,
'CI': 0.06849,
'M': 1000,
'dt': 1
}
#z = {**x, **y}
data = {**data1, **data2}
#from Fotovoltaica_V2_csvread import model # import model
from Fotovoltaica_V1 import model # import model
#instance = model.create_instance('Fotovoltaica_V2.dat')
#instance = model.create_instance(data)
instance = model.create_instance(data1,'Fotovoltaica_V2.dat')

It's hard to tell without seeing your entire model, but the section you have commented out for data2 should work:
data2 = {
    None: {
        'param': {None: val},
        ...
    }
}
I'm assuming that none of your parameters are indexed. If they are indexed, then you would need something like the following:
model = AbstractModel()
model.t = Set()
model.thing = Param(model.t)

input_data = {
    None: {
        't': {None: [1, 2, 3]},
        'thing': {1: 100, 2: 200, 3: 300}
    }
}
You would then create a model instance by calling
model.create_instance(input_data)
You can import the data from a CSV into Python as you normally would with pandas, but you will then need to do a little rework to get it into the format Pyomo expects, as sketched below.
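For instance, a minimal sketch of that rework for the indexed example above (the file and column names here are assumptions, not taken from your data):

import pandas as pd

# hypothetical CSV with columns "t" and "thing"
df = pd.read_csv("things.csv")

input_data = {None: {
    't': {None: df['t'].tolist()},
    'thing': dict(zip(df['t'], df['thing'])),
}}
instance = model.create_instance(input_data)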

Take a look at this example: https://github.com/Pyomo/pyomo/blob/master/examples/doc/pyomobook/overview-ch/wl_excel.py
I would suggest using a ConcreteModel instead of an AbstractModel when using pandas to load the data. Instead of creating Param objects, the dataframe can be used directly in the Constraint, along the lines of the sketch below.
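A minimal sketch of that idea, assuming df1 has the 'Consumo (Wh)' column from your commented-out code (the variable, constraint, and objective here are only illustrative, not your actual model):

from pyomo.environ import ConcreteModel, Var, Constraint, Objective, NonNegativeReals
from pyomo.opt import SolverFactory

demand = df1['Consumo (Wh)']   # assumption: this column exists in df1
T = range(len(demand))

model = ConcreteModel()
model.x = Var(T, domain=NonNegativeReals)

# the dataframe values appear directly in the constraint bodies; no Param needed
def demand_rule(m, t):
    return m.x[t] >= demand.iloc[t]
model.demand_con = Constraint(T, rule=demand_rule)
model.cost = Objective(expr=sum(model.x[t] for t in T))

SolverFactory('glpk').solve(model)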

Related

I want to add TensorBoard to the code from pytorch.org

I'm pretty new to deep learning, but I want to add TensorBoard to the following code to track loss, accuracy, average precision, and so on.
Sample code from the TorchVision Object Detection Finetuning Tutorial:
http://pytorch.org/tutorials/intermediate/torchvision_tutorial.html
import os
import numpy as np
import torch
from PIL import Image
import sys

import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

from engine import train_one_epoch, evaluate
import utils
import transforms as T

#from torch.utils.tensorboard import SummaryWriter
#writer = SummaryWriter()


class PennFudanDataset(object):
    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        # load all image files, sorting them to
        # ensure that they are aligned
        self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
        self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))

    def __getitem__(self, idx):
        # load images and masks
        img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
        mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
        img = Image.open(img_path).convert("RGB")
        # note that we haven't converted the mask to RGB,
        # because each color corresponds to a different instance
        # with 0 being background
        mask = Image.open(mask_path)
        mask = np.array(mask)
        # instances are encoded as different colors
        obj_ids = np.unique(mask)
        # first id is the background, so remove it
        obj_ids = obj_ids[1:]

        # split the color-encoded mask into a set
        # of binary masks
        masks = mask == obj_ids[:, None, None]

        # get bounding box coordinates for each mask
        num_objs = len(obj_ids)
        boxes = []
        for i in range(num_objs):
            pos = np.where(masks[i])
            xmin = np.min(pos[1])
            xmax = np.max(pos[1])
            ymin = np.min(pos[0])
            ymax = np.max(pos[0])
            boxes.append([xmin, ymin, xmax, ymax])

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # there is only one class
        labels = torch.ones((num_objs,), dtype=torch.int64)
        masks = torch.as_tensor(masks, dtype=torch.uint8)

        image_id = torch.tensor([idx])
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        # suppose all instances are not crowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["masks"] = masks
        target["image_id"] = image_id
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target

    def __len__(self):
        return len(self.imgs)


def get_model_instance_segmentation(num_classes):
    # load an instance segmentation model pre-trained on COCO
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    # now get the number of input features for the mask classifier
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                       hidden_layer,
                                                       num_classes)

    return model


def get_transform(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)


def main():
    # train on the GPU or on the CPU, if a GPU is not available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # our dataset has two classes only - background and person
    num_classes = 2
    # use our dataset and defined transformations
    dataset = PennFudanDataset('PennFudanPed', get_transform(train=True))
    dataset_test = PennFudanDataset('PennFudanPed', get_transform(train=False))

    # split the dataset in train and test set
    indices = torch.randperm(len(dataset)).tolist()
    dataset = torch.utils.data.Subset(dataset, indices[:-50])
    dataset_test = torch.utils.data.Subset(dataset_test, indices[-50:])

    # define training and validation data loaders
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=4, shuffle=True, num_workers=4,
        collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=2, shuffle=False, num_workers=4,
        collate_fn=utils.collate_fn)

    # get the model using our helper function
    model = get_model_instance_segmentation(num_classes)
    # move model to the right device
    model.to(device)

    # construct an optimizer
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=0.005,
                                momentum=0.9, weight_decay=0.0005)
    # and a learning rate scheduler
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)

    # let's train it for 10 epochs
    num_epochs = 10

    for epoch in range(num_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        evaluate(model, data_loader_test, device=device)

    print("That's it!")


if __name__ == "__main__":
    main()
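A minimal sketch of where TensorBoard could hook in, using an explicit training loop instead of the tutorial's train_one_epoch helper (in train mode, torchvision detection models return a dict of losses, which is what gets logged here; this is an illustration, not the tutorial's own logging):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()  # logs to ./runs/ by default
global_step = 0
for epoch in range(num_epochs):
    model.train()
    for images, targets in data_loader:
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        loss = sum(loss_dict.values())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # log the total loss and each component (classifier, box_reg, mask, ...)
        writer.add_scalar('train/total_loss', loss.item(), global_step)
        for name, value in loss_dict.items():
            writer.add_scalar('train/' + name, value.item(), global_step)
        global_step += 1
    lr_scheduler.step()
    evaluate(model, data_loader_test, device=device)
writer.close()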

Performance difference serializing pandas frames in Python 2.x / 3.x

I was experiencing some performance differences between Python 2.7 and 3.5 when serializing pandas frames to CSV, so I did a quick search on Google and found this benchmark:
https://gist.github.com/GitRay/4001b4962eb9f3e09a9d456ee5a30aae
I modified it a bit for my needs:
import pandas as pd
from time import time
import platform


def timeit(func, n=5):
    start = time()
    for i in range(n):
        func()
    end = time()
    return (end - start) / n


def csvdumps(s):
    s.to_csv('foo')
    return 'foo'


def csvloads(fn):
    return pd.read_csv(fn)


def hdfdumps(s):
    s.to_hdf('foo', 'bar', mode='w')
    return ('foo', 'bar')


def hdfloads(path):
    return pd.read_hdf('foo', 'bar')


df = pd.DataFrame({'text': [str(i % 1000) for i in range(1000000)],
                   'numbers': range(1000000)})

keys = ['csv', 'hdfstore']
d = {'csv': [csvloads, csvdumps],
     'hdfstore': [hdfloads, hdfdumps]}

result = dict()
for name, (loads, dumps) in d.items():
    text = dumps(df.text)
    numbers = dumps(df.numbers)
    result[name] = {'text': {'dumps': timeit(lambda: dumps(df.text)),
                             'loads': timeit(lambda: loads(text))},
                    'numbers': {'dumps': timeit(lambda: dumps(df.numbers)),
                                'loads': timeit(lambda: loads(numbers))}}

########
# Plot #
########
# Much of this was taken from
# http://nbviewer.ipython.org/gist/mwaskom/886b4e5cb55fed35213d
# by Michael Waskom
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid", font_scale=1.3)

w, h = 7, 7
f, (left, right) = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(w*2, h), squeeze=True)

df = pd.DataFrame({'loads': [result[key]['text']['loads'] for key in keys],
                   'dumps': [result[key]['text']['dumps'] for key in keys],
                   'storage': keys})
df = pd.melt(df, "storage", value_name="duration", var_name="operation")
sns.barplot("duration", "storage", "operation", data=df, ax=left)
left.set(xlabel="Duration (s)", ylabel="")
sns.despine(bottom=True)
left.set_title('Cost to Serialize Text')
left.legend(loc="lower center", ncol=2, frameon=True, title="operation")

df = pd.DataFrame({'loads': [result[key]['numbers']['loads'] for key in keys],
                   'dumps': [result[key]['numbers']['dumps'] for key in keys],
                   'storage': keys})
df = pd.melt(df, "storage", value_name="duration", var_name="operation")
sns.barplot("duration", "storage", "operation", data=df, ax=right)
right.set(xlabel="Duration (s)", ylabel="")
sns.despine(bottom=True)
right.set_title('Cost to Serialize Numerical Data')
right.legend(loc="lower center", ncol=2, frameon=True, title="operation")

plt.savefig('serialize_py' + '.'.join(platform.python_version_tuple()) + '.png')
As you can see in the results, serializing in Python 3 is much slower:

         python 2.7    python 3.5    diff
load     0.3504s       0.329005s     +06.50%
dump     1.2784s       3.333152s     -61.65%
Does anybody know why?

Python KeyError: 1.0

I'm trying to run this code:
from math import sqrt
import numpy as np
import warnings
from collections import Counter
import pandas as pd
import random


def k_nearest_neighbors(data, predict, k=3):
    if len(data) >= k:
        warnings.warn('K is set to a value less than total voting groups')
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            distances.append([euclidean_distance, group])
    votes = [i[1] for i in sorted(distances)[:k]]
    print(Counter(votes).most_common(1))
    vote_result = Counter(votes).most_common(1)[0][0]
    return vote_result


df = pd.read_csv('bc2.txt')
df.replace('?', -99999, inplace=True)
df.drop(['id'], 1, inplace=True)
full_data = df.astype(float).values.tolist()
random.shuffle(full_data)

test_size = 0.2
train_set = {2: [], 4: []}
test_set = {2: [], 4: []}
train_data = full_data[:-int(test_size*len(full_data))]
test_data = full_data[-int(test_size*len(full_data)):]

for i in train_data:
    train_set[i[-1]].append(i[:-1])
for i in train_data:
    test_set[i[-1]].append(i[:-1])

correct = 0
total = 0

for group in test_set:
    for data in test_set[group]:
        vote = k_nearest_neighbors(train_set, data, k=5)
        if group == vote:
            correct += 1
        total += 1

print('Accuracy:', correct/total)
It comes out with this error message:
File "ml8.py", line 38, in <module>
    train_set[i[-1]].append(i[:-1])
KeyError: 1.0
The file ml8.py is the code above. Below is a sample of the txt file:
id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
1000025,2,5,1,1,1,2,1,3,1,1
1002945,2,5,4,4,5,7,10,3,2,1
1015425,2,3,1,1,1,2,2,3,1,1
1016277,2,6,8,8,1,3,4,3,7,1
1017023,2,4,1,1,3,2,1,3,1,1
1017122,4,8,10,10,8,7,10,9,7,1
1018099,2,1,1,1,1,2,10,3,1,1
1018561,2,2,1,2,1,2,1,3,1,1
1033078,2,2,1,1,1,2,1,1,1,5
1033078,2,4,2,1,1,2,1,2,1,1
1035283,2,1,1,1,1,1,1,3,1,1
1036172,2,2,1,1,1,2,1,2,1,1
1041801,4,5,3,3,3,2,3,4,4,1
I'm using Python 2.7.11.
Your train_set only contains keys 2 and 4, whereas your classes in that sample are 1 and 5.
Instead of using
train_set = {2:[],4:[]}
you might have better luck with defaultdict:
from collections import defaultdict
train_set = defaultdict(list)
This way a non-existent key will be initialized to a new empty list on first access.
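Applied to your loops it would look like this (note that the second loop in your code iterates over train_data again; it should iterate over test_data):

from collections import defaultdict

train_set = defaultdict(list)
test_set = defaultdict(list)

for row in train_data:
    train_set[row[-1]].append(row[:-1])   # missing class labels are created on demand
for row in test_data:
    test_set[row[-1]].append(row[:-1])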

How to improve my feature selection for an NB classifier?

I have read that improving feature selection will reduce the training time of my classifier and also improve accuracy, but I am not sure how I can reduce the number of features. Should I count them and then select the first 3000, for example?
This is my code:
# imports inferred from the calls below
import csv
import datetime
import pickle

import nltk
import ujson


def save_object(obj, filename):
    with open(filename, 'wb') as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
        print "saved"
        ujson.dumps({"output": "obj"})


with open('neg5000.csv', 'rb') as f:
    reader = csv.reader(f)
    neg_tweets = list(reader)
    print "list ready"

with open('pos5000.csv', 'rb') as f:
    reader = csv.reader(f)
    pos_tweets = list(reader)
    print "list ready"

tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))


def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words


def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = list(wordlist.keys())[:3000]
    #word_features = wordlist.keys()
    return word_features


def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features


#def extract_features(words):
#    return dict([(word, True) for word in words])

word_features = get_word_features(get_words_in_tweets(tweets))
training_set = nltk.classify.apply_features(extract_features, tweets)
save_object(word_features, 'wordf.save')
print 'features done'
print datetime.datetime.now()

classifier = nltk.NaiveBayesClassifier.train(training_set)
print 'training done'
print datetime.datetime.now()
save_object(classifier, 'classifier.save')

tweet = 'I love this car'
print classifier.classify(extract_features(tweet.split()))
There are a number of ways to approach feature selection for the supervised classification problem (which is what Naive Bayes solves). I suggest heading over to the scikit-learn manual and just trying everything listed there, since the choice of a particular method depends on the data you have.
The easiest way to do this is to switch to the scikit-learn implementation of Naive Bayes and then use a Pipeline to chain the feature selection and classifier training. See this tutorial for code examples.
Here's a version of your code using scikit-learn with SelectPercentile feature selection:
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


def read_input(path):
    with open(path) as handle:
        lines = (line.rsplit(",", 1) for line in handle)
        return [text for text, label in lines]


# Assuming each line in ``neg5000.csv`` and ``pos5000.csv`` is a
# UTF-8-encoded tweet.
neg_tweets = read_input("neg5000.csv")
pos_tweets = read_input("pos5000.csv")

X = np.append(neg_tweets, pos_tweets)
y = np.append(np.full(len(neg_tweets), -1, dtype=int),
              np.full(len(pos_tweets), 1, dtype=int))

p = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("selector", SelectPercentile(percentile=20)),
    ("nb", MultinomialNB())
])

p.fit(X, y)
print(p.predict(["I love this car"]))
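The percentile is an ordinary hyperparameter, so it can be tuned rather than hard-coded; here is a minimal sketch using GridSearchCV over the pipeline above (the grid values are just illustrative):

from sklearn.model_selection import GridSearchCV

search = GridSearchCV(p, {"selector__percentile": [10, 20, 50, 100]}, cv=5)
search.fit(X, y)
print(search.best_params_)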

Feature selection in scikit-learn for multiple variables and thousands of features

I am trying to perform feature selection for a logistic regression classifier. Originally there are 4 variables: name, location, gender, and the label, ethnicity. The three variables, the name above all, give rise to tens of thousands of additional "features"; for example, the name "John Snow" gives rise to 2-letter substrings like 'jo', 'oh', 'hn', etc. The feature set then undergoes DictVectorizer transformation.
I am trying to follow this tutorial (http://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html), but I am not sure if I am doing it right, since the tutorial uses a small number of features while mine has tens of thousands after vectorization. Also, plt.show() displays a blank figure.
# coding=utf-8
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import re
import random
import time
from random import randint
import csv
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

# Assign X and y variables (df is assumed to be loaded elsewhere)
X = df.raw_name.values
X2 = df.name.values
X3 = df.gender.values
X4 = df.location.values
y = df.ethnicity_scan.values


# Feature extraction functions
def feature_full_name(nameString):
    try:
        full_name = nameString
        if len(full_name) > 1:  # do not accept a name with only 1 character
            return full_name
        else:
            return '?'
    except:
        return '?'


def feature_avg_wordLength(nameString):
    try:
        space = 0
        for i in nameString:
            if i == ' ':
                space += 1
        length = float(len(nameString) - space)
        name_entity = float(space + 1)
        avg = round(float(length/name_entity), 0)
        return avg
    except:
        return 0


def feature_name_entity(nameString2):
    space = 0
    try:
        for i in nameString2:
            if i == ' ':
                space += 1
        return space + 1
    except:
        return 0


def feature_gender(genString):
    try:
        gender = genString
        if len(gender) >= 1:
            return gender
        else:
            return '?'
    except:
        return '?'


def feature_noNeighborLoc(locString):
    try:
        x = re.sub(r'^[^, ]*', '', locString)  # remove everything up to and including the first ','
        y = x[2:]  # remove the subsequent ',' and ' '
        return y
    except:
        return '?'


def list_to_dict(substring_list):
    try:
        substring_dict = {}
        for i in substring_list:
            substring_dict['substring=' + str(i)] = True
        return substring_dict
    except:
        return '?'


# Transform format of X variables, and spit out a numpy array for all features
my_dict13 = [{'name-entity': feature_name_entity(feature_full_name(i))} for i in X2]
my_dict14 = [{'avg-length': feature_avg_wordLength(feature_full_name(i))} for i in X]
my_dict15 = [{'gender': feature_full_name(i)} for i in X3]
my_dict16 = [{'location': feature_noNeighborLoc(feature_full_name(i))} for i in X4]
my_dict17 = [{'dummy1': 1} for i in X]
my_dict18 = [{'dummy2': random.randint(0, 2)} for i in X]

all_dict = []
for i in range(0, len(my_dict13)):  # was len(my_dict); my_dict is undefined above
    temp_dict = dict(my_dict13[i].items() + my_dict14[i].items()
                     + my_dict15[i].items() + my_dict16[i].items()
                     + my_dict17[i].items() + my_dict18[i].items())
    all_dict.append(temp_dict)

dv = DictVectorizer()  # moved up: dv must exist before fit_transform is called
newX = dv.fit_transform(all_dict)

# Separate the training and testing data sets
half_cut = int(len(df)/2.0)*-1
X_train = newX[:half_cut]
X_test = newX[half_cut:]
y_train = y[:half_cut]
y_test = y[half_cut:]

# Fitting X and y into model, using training data
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Feature selection
plt.figure(1)
plt.clf()
X_indices = np.arange(X_train.shape[-1])
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(X_train, y_train)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices - .45, scores, width=.2,
        label=r'Univariate score ($-Log(p_{value})$)', color='g')
plt.show()
Warning:
E:\Program Files Extra\Python27\lib\site-packages\sklearn\feature_selection\univariate_selection.py:111: UserWarning: Features [[0 0 0 ..., 0 0 0]] are constant.
It looks like the way you split your data into training and testing sets is not working:
# Separate the training and testing data sets
X_train = newX[:half_cut]
X_test = newX[half_cut:]
If your rows are ordered (for example, grouped by ethnicity), a contiguous half-split like this can leave some features constant within the training half, which is what the UserWarning above is telling you. (Note also that your 'dummy1' feature is the constant 1 for every row, which by itself triggers that warning.) If you already use sklearn, it is much more convenient to use the built-in splitting routine, which shuffles by default:
from sklearn import cross_validation

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5, random_state=0)
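For reference, cross_validation was reorganized in scikit-learn 0.18; on newer versions the same routine lives in model_selection:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(newX, y, test_size=0.5, random_state=0)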