use global variables in AWS Sagemaker script - amazon-web-services

After having correctly deployed our model, I need to invoke it via lambda function. The script features two cleaning function, the first one (cleaning()) gives us 5 variables: the cleaned dataset and 4 other variables (scaler, monthdummies, compadummies, parceldummies) that we need to use in the second cleaning function (cleaning_test()).
The reason behind this is that in the use case I'll have only one instance at a time to perform predictions on, not an entire dataset. This means that I pass the row to the first cleaning() function since some commands won't work. I can't also use a scaler and neither create dummy variables, so the aim is to import the scaler and some dummies used in the cleaning() function, since they come from the whole dataset, that I used to train the model.
Hence, in the input_fn() function, the input needs to be cleaned using the cleaning_test() function, that requires the scaler and the three lists of dummies from the cleaning() one.
When I train the model, the cleaning() function works fine, but after the deployment, if we invoke the endpoint, it raises the error that variable "scaler" is not defined.
Below is the script.py:
Note that the test is # since I've already tested it, so now I'm training on the whole dataset and I want to predict completely new instances
def cleaning(data):
some cleaning on data stored in s3
return cleaned_data, scaler, monthdummies, compadummies, parceldummies
def cleaning_test(data, scaler, monthdummies, compadummies, parceldummies):
cleaning on data without labels
return cleaned_data
def model_fn(model_dir):
clf = joblib.load(os.path.join(model_dir, "model.joblib"))
return clf
def input_fn(request_body, request_content_type):
if request_content_type == "application/json":
data = json.loads(request_body)
df = pd.DataFrame(data, index = [0])
input_data = cleaning_test(df, scaler, monthdummies, compadummies, parceldummies)
else:
pass
return input_data
def predict_fn(input_data, model):
return model.predict_proba(input_data)
if __name__ =='__main__':
print('extracting arguments')
parser = argparse.ArgumentParser()
# hyperparameters sent by the client are passed as command-line arguments to the script.
parser.add_argument('--n_estimators', type=int, default=10)
parser.add_argument('--min-samples-leaf', type=int, default=3)
# Data, model, and output directories
parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
#parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
parser.add_argument('--train-file', type=str, default='fp_train.csv')
#parser.add_argument('--test-file', type=str, default='fp_test.csv')
args, _ = parser.parse_known_args()
print('reading data')
train_df = pd.read_csv(os.path.join(args.train, args.train_file))
#test_df = pd.read_csv(os.path.join(args.test, args.test_file))
print("cleaning")
train_df, scaler, monthdummies, compadummies, parceldummies = cleaning(train_df)
#test_df, scaler1, monthdummies1, compadummies1, parceldummies1 = cleaning(test_df)
print("splitting")
y = train_df.loc[:,"event"]
X = train_df.loc[:, train_df.columns != 'event']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
"""print('building training and testing datasets')
X_train = train_df.loc[:, train_df.columns != 'event']
X_test = test_df.loc[:, test_df.columns != 'event']
y_train = train_df.loc[:,"event"]
y_test = test_df.loc[:,"event"]"""
print(X_train.columns)
print(X_test.columns)
# train
print('training model')
model = RandomForestClassifier(
n_estimators=args.n_estimators,
min_samples_leaf=args.min_samples_leaf,
n_jobs=-1)
model.fit(X_train, y_train)
# print abs error
print('validating model')
proba = model.predict_proba(X_test)
# persist model
path = os.path.join(args.model_dir, "model.joblib")
joblib.dump(model, path)
print('model persisted at ' + path)
That I run through:
sklearn_estimator = SKLearn(
entry_point='script.py',
role = get_execution_role(),
train_instance_count=1,
train_instance_type='ml.c5.xlarge',
framework_version='0.20.0',
base_job_name='rf-scikit',
hyperparameters = {'n_estimators': 15})
sklearn_estimator.fit({'train':trainpath})
sklearn_estimator.latest_training_job.wait(logs='None')
artifact = sm_boto3.describe_training_job(
TrainingJobName=sklearn_estimator.latest_training_job.name)['ModelArtifacts']['S3ModelArtifacts']
predictor = sklearn_estimator.deploy(
instance_type='ml.c5.large',
initial_instance_count=1)
The question is, how can I "store" the variables given by the cleaning() function during the training process, in order to use them in the input_fn() function, making cleaning_test() work fine?
Thanks!

Related

Getting a 'ValueError: 2 many values to unpack' from a method that lists only one required arg. I'm not sure of the best way to unpack it

I am calling a method from a predefined Class (L2Interface) from the acitoolkit module that lists only one required argument. The method returns two strings 'encap-type' and 'encap-id'. I am floundering with the best way to unpack these values. Here is my script. The method in question is: 'vlans = aci.L2Interface.parse_encap(encap)'
import sys
import acitoolkit.acitoolkit as aci
import requests
import re
def init(self, name, encap_type, encap_id, encap_mode=None):
self.name = None
self.encap_type = VLAN
self.encap_id = None
def main():
"""
Main Show Endpoints Routine
:return: None
"""
# Take login credentials from the command line if provided
# Otherwise, take them from your environment variables file ~/.profile
description = ('Simple application that logs on to the APIC'
' and displays all of the Endpoints.')
creds = aci.Credentials('apic', description)
args = creds.get()
# Login to APIC
session = aci.Session(args.url, args.login, args.password, verify_ssl=False)
resp = session.login()
if not resp.ok:
print('%% Could not login to APIC')
sys.exit(0)
# Get encap per interface
# and store the data as tuples in a List
data = []
encap = 'vlan-[0-9].*'
#vxtype = 'vxlan\-[0-9|a-z].*'
vlans = aci.L2Interface.parse_encap(encap)
for vlan in vlans:
data.append((vlan.attributes['encap_type'],
vlan.attributes['encap_id']))
# Display the data downloaded
col_widths = [19, 17, 15, 15, 15]
template = ''
for idx, width in enumerate(col_widths):
template += '{%s:%s} ' % (idx, width)
print(template.format("ENDCAP_TYPE", "ENCAP_ID"))
fmt_string = []
for i in range(0, len(col_widths)):
fmt_string.append('-' * (col_widths[i] - 2))
print(template.format(*fmt_string))
for rec in data:
print(template.format(*rec))
if name == 'main':
try:
main()
except KeyboardInterrupt:
pass
I am trying to connect to an APIC, grab L2 interfaces with encapsulation (encap) assigned and return them in a list.

AttributeError: Can't pickle local object 'train.<locals>.create_model'

I am trying to use my own ML models for creating trainings job in aws Sagemaker. When I start training process everything goes well but at the end it says that "AttributeError: Can't pickle local object 'train..create_model'". I am new into this job. I did the same things for mlp, knn, cart, and svr but never encountered with that issue. I know that lstm uses too much different things to create model but I can not figure out how to solve that issue.
Here is my train.py file where I get the error:
from __future__ import print_function
import json
import os
import pickle
import sys
import traceback
import pandas as pd
import numpy as np
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
# These are the paths to where SageMaker mounts interesting things in your container.
prefix = "/opt/ml/"
input_path = prefix + "input/data"
output_path = os.path.join(prefix, "output")
model_path = os.path.join(prefix, "model")
# This algorithm has a single channel of input data called 'training'. Since we run in
# File mode, the input files are copied to the directory specified here.
channel_name = "training"
training_path = os.path.join(input_path, channel_name)
# The function to execute training.
def train():
print("Starting the training")
print(training_path)
try:
# Take the set of files and read them all into a single pandas dataframe
input_files = [ os.path.join(training_path, file) for file in os.listdir(training_path) ]
if len(input_files) == 0:
raise ValueError(('There are no files in {}.\n' +
'This usually indicates that the channel ({}) was incorrectly specified,\n' +
'the data specification in S3 was incorrectly specified or the role specified\n' +
'does not have permission to access the data.').format(training_path, channel_name))
raw_data = [ pd.read_csv(file, header=0, index_col=0) for file in input_files ]
data = pd.concat(raw_data)
print(data)
# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
n_vars = 1 if type(data) is list else data.shape[1]
df = DataFrame(data)
cols, names = list(), list()
# input sequence (t-n, ... t-1)
for i in range(n_in, 0, -1):
cols.append(df.shift(i))
names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
# forecast sequence (t, t+1, ... t+n)
for i in range(0, n_out):
cols.append(df.shift(-i))
if i == 0:
names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
else:
names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
# put it all together
agg = concat(cols, axis=1)
agg.columns = names
# drop rows with NaN values
if dropnan:
agg.dropna(inplace=True)
return agg
values = data.values
# ensure all data is float
values = values.astype('float32')
# normalize features
scaler = MinMaxScaler()
scaled = scaler.fit_transform(values)
# specify the number of lag time steps
n_timesteps = 3
n_features = 4
# frame as supervised learning
reframed = series_to_supervised(scaled, n_timesteps, 1)
print(reframed.shape)
# drop columns we don't want to predict
reframed.drop(reframed.columns[[4,9,14,15,16,17,18]], axis=1, inplace=True)
print(reframed.head())
# split into train and test sets
values = reframed.values
n_train_size = 403
train = values[:n_train_size, :]
test = values[n_train_size:, :]
# split into input and outputs
n_obs = n_timesteps * n_features
train_X, train_y = train[:, :n_obs], train[:, -1]
test_X, test_y = test[:, :n_obs], test[:, -1]
print(train_X.shape, len(train_X), train_y.shape)
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], n_timesteps, n_features))
test_X = test_X.reshape((test_X.shape[0], n_timesteps, n_features))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)
# Function to create model
def create_model():
# create model
model = Sequential()
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
# Compile model
# optimizer = SGD(lr=learn_rate, momentum=momentum)
model.compile(loss='mae',optimizer='adam')
return model
from scikeras.wrappers import KerasRegressor
# create model
model = KerasRegressor(model=create_model, verbose=0)
from sklearn.model_selection import GridSearchCV
# define the grid search parameters
batch_size = [2,4,8,16,32]
epochs = [10, 50, 100]
#learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
#momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(train_X, train_y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
print("%f (%f) with: %r" % (mean, stdev, param))
# save the model
with open(os.path.join(model_path, "snop-lstm.pkl"), "wb") as out:
pickle.dump(grid, out)
print("Training complete.")
except Exception as e:
# Write out an error file. This will be returned as the failureReason in the
# DescribeTrainingJob result.
trc = traceback.format_exc()
with open(os.path.join(output_path, "failure"), "w") as s:
s.write("Exception during training: " + str(e) + "\n" + trc)
# Printing this causes the exception to be in the training job logs, as well.
print("Exception during training: " + str(e) + "\n" + trc, file=sys.stderr)
# A non-zero exit code causes the training job to be marked as Failed.
sys.exit(255)
if __name__ == "__main__":
train()
# A zero exit code causes the job to be marked a Succeeded.
sys.exit(0)
And this is the log:
2022-02-25T10:28:16.751+03:00
Exception during training: Can't pickle local object 'train.<locals>.create_model'
Exception during training: Can't pickle local object 'train.<locals>.create_model'
2022-02-25T10:28:16.751+03:00
Traceback (most recent call last):
File "/opt/program/train", line 154, in train
pickle.dump(grid, out)
Traceback (most recent call last): File "/opt/program/train", line 154, in train pickle.dump(grid, out)
2022-02-25T10:28:16.751+03:00
AttributeError: Can't pickle local object 'train.<locals>.create_model'
AttributeError: Can't pickle local object 'train.<locals>.create_model'
It seems that you are trying to pickle an object of class GridSearchCV instead of the model itself:
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
...
...
pickle.dump(grid, out)
I think what you want instead is to retrieve the best model (via best_model_, see here: https://github.com/scikit-learn/scikit-learn/blob/37ac6788c/sklearn/model_selection/_search.py#L1247) and then pickle that model

Unable to lunch Multiple Streaming Pipeline ( N to N Pipeline) Dynamically (Using Runtime Value Provider) in Single Dataflow Job in Python

I am trying to launch a Streaming Dataflow Job which contains n number of pipelines.
Based on configured topic and corresponding BQ table for each Topic i want to launch a Pipeline inside a one Streaming Job.
My actual problem is i have to create and upload a template for each and every project. What i want is, i can reuse the uploaded template and only configuration files ihave to pass for launching new dataflow job by changing topic,subscription, dataset and bq table.
Which is i am unable to reuse the template.
Please help me on this and let me know if this is possible or not. Because Google has also provided one to one template. Not many to many Template (e.g Three topic - Three BQ Table (three data pipeleine) , n-n).
import logging
import os
import json
from google.cloud import storage
from apache_beam import Pipeline, ParDo, DoFn
from apache_beam.io import ReadFromPubSub, WriteToBigQuery, BigQueryDisposition
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions, WorkerOptions, GoogleCloudOptions, \
SetupOptions
def _get_storage_service():
storage_client = storage.Client \
.from_service_account_json(
json_credentials_path='C:\Users\dneema\PycharmProjects\iot_dataflow\df_stm_iot_pubsub_bq\service_account_credentials.json')
print('storage service fetched')
return storage_client
class RuntimeOptions(PipelineOptions):
def __init__(self, flags=None, **kwargs):
super(RuntimeOptions, self).__init__(flags, **kwargs)
#classmethod
def _add_argparse_args(cls, parser):
parser.add_value_provider_argument('--bucket_name', type=str)
parser.add_value_provider_argument('--config_json_path', type=str,)
class PipelineCreator:
def __init__(self):
self.options = PipelineOptions()
storage_client = storage.Client.from_service_account_json(
'service_account_credentials_updated.json')
runtime_options = self.options.view_as(RuntimeOptions)
bucket_name = str(runtime_options.bucket_name)
config_json_path = str(runtime_options.config_json_path)
# get the bucket with name
bucket = storage_client.get_bucket(bucket_name)
# get bucket file as blob
blob = bucket.get_blob(config_json_path)
# convert to string and load config
json_data = blob.download_as_string()
self.configData = json.loads(json_data)
dataflow_config = self.configData['dataflow_config']
self.options.view_as(StandardOptions).streaming = bool(dataflow_config['streaming'])
self.options.view_as(SetupOptions).save_main_session = True
worker_options = self.options.view_as(WorkerOptions)
worker_options.max_num_workers = int(dataflow_config['max_num_worker'])
worker_options.autoscaling_algorithm = str(dataflow_config['autoscaling_algorithm'])
#worker_options.machine_type = str(dataflow_config['machine_type'])
#worker_options.zone = str(dataflow_config['zone'])
#worker_options.network = str(dataflow_config['network'])
#worker_options.subnetwork = str(dataflow_config['subnetwork'])
def run(self):
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'dataflow-service-account.json'
project_id = self.configData['project_id']
dataset_id = self.configData['dataset_id']
topics = self.configData['topics']
table_ids = self.configData['bq_table_ids']
error_table_id = self.configData['error_table_id']
logger = logging.getLogger(project_id)
logger.info(self.options.display_data())
pipeline = Pipeline(options=self.options)
size = len(topics)
for index in range(size):
print(topics[index])
pipeline_name = "pipeline_"+str(index)
logger.info("Launch pipeline :: "+pipeline_name)
messages = pipeline | 'Read PubSub Message in ' + pipeline_name >> ReadFromPubSub(topic=topics[index])
logger.info("Read PubSub Message")
valid_messages, invalid_messages = messages | 'Convert Messages to TableRows in ' + pipeline_name >> ParDo(TransformMessageToTableRow()).with_outputs('invalid', main='valid')
valid_messages | 'Write Messages to BigQuery in ' + pipeline_name >> WriteToBigQuery(table=table_ids[index],
dataset=dataset_id,
project=project_id,
write_disposition=BigQueryDisposition.WRITE_APPEND)
pipeline.run().wait_until_finish()
class TransformMessageToTableRow(DoFn):
def process(self, element, *args, **kwargs):
logging.getLogger('dataflow').log(logging.INFO, element)
print element
print("element type ", type(element))
print("inside bq pardo")
import json
try:
message_rows = json.loads(element)
# if using emulator, uncomment below line
message_rows = json.loads(message_rows)
print 'loaded element'
except:
try:
element = "[" + element + "]"
message_rows = json.loads(element)
except Exception as e:
print(e)
from apache_beam import pvalue
yield [pvalue.TaggedOutput('invalid', [element, str(e)])]
print(message_rows)
print("message rows", type(message_rows))
if not isinstance(message_rows, list):
message_rows = [message_rows]
#rows = list()
if isinstance(message_rows, list):
for row in message_rows:
try:
new_row = dict()
for k, v in row.items():
new_row[str(k)] = v
#rows.append(new_row)
print(new_row)
yield new_row
except Exception as e:
print(e)
from apache_beam import pvalue
yield pvalue.TaggedOutput('invalid', [row, str(e)])
if __name__ == '__main__':
PipelineCreator().run()
Here Runtime argument as bucket_name and config_json_path for all the configuration related stuffs like Dataset, BQ table, Topics/ Subscription and all Workflow options.
This is possible or not ? Because Google has also provided one to one template. Not many to many Template (e.g Three topic - Three BQ Table (three data pipeleine) , n-n).
Regarding this previously answered thread Unable to run multiple Pipelines in desired order by creating template in Apache Beam, you can run only one pipeline inside a template at any time.
You'll have to delegate the template creation to another service and pass the configuration with it, just follow the link inside the thread and you'll have How To examples.

How to classify image in real time using tensorflow?

I'm trying to use raspberry pi camera to capture image and classify the image in real time into three classes. What I did is using the code below. It can predict in the first iteration. The problem is that it shows me ran out of memory after the second iteration. Is there anyway to fix this?
import numpy as np
import tensorflow as tf
import argparse
import os
import sys
def create_graph(model_file):
"""Creates a graph from saved GraphDef file and returns a saver."""
# Creates graph from saved graph_def.pb.
with tf.gfile.FastGFile(model_file, 'rb') as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
_ = tf.import_graph_def(graph_def, name='')
def run_inference(images, out_file, labels, model_file, k=5):
# Creates graph from saved GraphDef.
create_graph(model_file)
if out_file:
out_file = open(out_file, 'wb', 1)
with tf.Session() as sess:
softmax_tensor = sess.graph.get_tensor_by_name('final_result:0')
for img in images:
if not tf.gfile.Exists(img):
tf.logging.fatal('File does not exist %s', img)
continue
image_data = tf.gfile.FastGFile(img, 'rb').read()
predictions = sess.run(softmax_tensor,
{'DecodeJpeg/contents:0': image_data})
predictions = np.squeeze(predictions)
top_k = predictions.argsort()[-k:][::-1] # Getting top k predictions
vals = []
for node_id in top_k:
human_string = labels[node_id]
score = predictions[node_id]
vals.append('%s=%.5f' % (human_string, score))
rec = "%s\t %s" % (img, ", ".join(vals))
if out_file:
out_file.write(rec)
out_file.write("\n")
else:
print(rec)
if out_file:
print("Output stored to a file")
out_file.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Classify Image(s)')
parser.add_argument('-i','--in', help='Input Image file ')
parser.add_argument('-li','--list', help='List File having input image paths')
parser.add_argument('-o','--out', help='Output file for storing the content')
parser.add_argument('-m','--model', help='model file path (protobuf)', required=True)
parser.add_argument('-l','--labels', help='labels text file', required=True)
parser.add_argument('-r','--root', help='path to root directory of input data')
args = vars(parser.parse_args())
# Read input
if not args['in'] and not args['list']:
print("Either -in or -list option is required.")
sys.exit(1)
if args['in']:
images = [args['in']]
else: # list must be given
with open(args['list']) as ff:
images = filter(lambda x: x, map(lambda y: y.strip(), ff.readlines()))
# if a separate root directory given then make a new path
if args['root']:
print("Input data from : %s" % args['root'])
images = map(lambda p: os.path.join(args['root'], p), images)
with open(args['labels'], 'rb') as f:
labels = [str(w).replace("\n", "") for w in f.readlines()]
while True:
imagename='/home/pi/Desktop/camerasnap.jpg'
images=raspi.capture(imagename)
run_inference(images=images, out_file=args['out'], labels=labels, model_file=args['model'])
The problem is that you are creating the graph in every run_inference method call:
while True:
imagename='/home/pi/Desktop/camerasnap.jpg'
images=raspi.capture(imagename)
run_inference(images=images, out_file=args['out'], labels=labels, model_file=args['model'])
def run_inference(images, out_file, labels, model_file, k=5):
# Creates graph from saved GraphDef.
create_graph(model_file)
...
As the graph probably uses almost all memory in your GPU it fails in the second iteration when a the code tries to create a new graph. You should create only one graph for all the program life.
Try this:
create_graph(model_file)
while True:
imagename='/home/pi/Desktop/camerasnap.jpg'
images=raspi.capture(imagename)
run_inference(images=images, out_file=args['out'], labels=labels, model_file=args['model'])

GeoDJango: retrieve last inserted primary key from LayerMapping

I am building an application with GeoDjango and I have the following problem:
I need to read track data from a GPX file and those data should be stored in a model MultiLineStringField field.
This should happen in the admin interface, where the user uploads a GPX file
I am trying to achieve this, namely that the data grabbed from the file should be assigned to the MultiLineStringField, while the other fields should get values from the form.
My model is:
class GPXTrack(models.Model):
nome = models.CharField("Nome", blank = False, max_length = 255)
slug = models.SlugField("Slug", blank = True)
# sport natura arte/cultura
tipo = models.CharField("Tipologia", blank = False, max_length = 2, choices=TIPOLOGIA_CHOICES)
descrizione = models.TextField("Descrizione", blank = True)
gpx_file = models.FileField(upload_to = 'uploads/gpx/')
track = models.MultiLineStringField(blank = True)
objects = models.GeoManager()
published = models.BooleanField("Pubblicato")
rel_files = generic.GenericRelation(MyFiles)
#publish_on = models.DateTimeField("Pubblicare il", auto_now_add = True)
created = models.DateTimeField("Created", auto_now_add = True)
updated = models.DateTimeField("Updated", auto_now = True)
class Meta:
#verbose_name = "struttura'"
#verbose_name_plural = "strutture"
ordering = ['-created']
def __str__(self):
return str(self.nome)
def __unicode__(self):
return '%s' % (self.nome)
def put(self):
self.slug = sluggy(self.nome)
key = super(Foresta, self).put()
# do something after save
return key
While in the admin.py file I have overwritten the save method as follows:
from django.contrib.gis import admin
from trails.models import GPXPoint, GPXTrack
from django.contrib.contenttypes import generic
from django.contrib.gis.gdal import DataSource
#from gpx_mapping import GPXMapping
from django.contrib.gis.utils import LayerMapping
from django.template import RequestContext
import tempfile
import os
import pprint
class GPXTrackAdmin(admin.OSMGeoAdmin):
list_filter = ( 'tipo', 'published')
search_fields = ['nome']
list_display = ('nome', 'tipo', 'published', 'gpx_file')
inlines = [TrackImagesInline, TrackFilesInline]
prepopulated_fields = {"slug": ("nome",)}
def save_model(self, request, obj, form, change):
"""When creating a new object, set the creator field.
"""
if 'gpx_file' in request.FILES:
# Get
gpxFile = request.FILES['gpx_file']
# Save
targetPath = tempfile.mkstemp()[1]
destination = open(targetPath, 'wt')
for chunk in gpxFile.chunks():
destination.write(chunk)
destination.close()
#define fields of interest for LayerMapping
track_point_mapping = {'timestamp' : 'time',
'point' : 'POINT',
}
track_mapping = {'track' : 'MULTILINESTRING'}
gpx_file = DataSource(targetPath)
mytrack = LayerMapping(GPXTrack, gpx_file, track_mapping, layer='tracks')
mytrack.save()
#remove the temp file saved
os.remove(targetPath)
orig = GPXTrack.objects.get(pk=mytrack.pk)
#assign the parsed values from LayerMapping to the appropriate Field
obj.track = orig.track
obj.save()
As far as I know:
LayerMapping cannot be used to update a field but only to save a new one
I cannot access a specific field of the LayerMapping object (ie in the code above: mytrack.track) and assign its value to a model field (ie obj.track) in the model_save method
I cannot retrieve the primary key of the last saved LayerMapping object (ie in the code above: mytrack.pk) in order to update it with the values passed in the form for the field not mapped in LayerMapping.mapping
What can I do then?!?!
I sorted it out subclassing LayerMapping and adding a method get_values() that instead of saving the retrieved data, returns them for any use or manipulation.The get_values method is a copy of the LayerMapping::save() method that returns the values instead of saving them.
I am using django 1.5
import os
from django.contrib.gis.utils import LayerMapping
import sys
class MyMapping(LayerMapping):
def get_values(self, verbose=False, fid_range=False, step=False,
progress=False, silent=False, stream=sys.stdout, strict=False):
"""
Returns the contents from the OGR DataSource Layer
according to the mapping dictionary given at initialization.
Keyword Parameters:
verbose:
If set, information will be printed subsequent to each model save
executed on the database.
fid_range:
May be set with a slice or tuple of (begin, end) feature ID's to map
from the data source. In other words, this keyword enables the user
to selectively import a subset range of features in the geographic
data source.
step:
If set with an integer, transactions will occur at every step
interval. For example, if step=1000, a commit would occur after
the 1,000th feature, the 2,000th feature etc.
progress:
When this keyword is set, status information will be printed giving
the number of features processed and sucessfully saved. By default,
progress information will pe printed every 1000 features processed,
however, this default may be overridden by setting this keyword with an
integer for the desired interval.
stream:
Status information will be written to this file handle. Defaults to
using `sys.stdout`, but any object with a `write` method is supported.
silent:
By default, non-fatal error notifications are printed to stdout, but
this keyword may be set to disable these notifications.
strict:
Execution of the model mapping will cease upon the first error
encountered. The default behavior is to attempt to continue.
"""
# Getting the default Feature ID range.
default_range = self.check_fid_range(fid_range)
# Setting the progress interval, if requested.
if progress:
if progress is True or not isinstance(progress, int):
progress_interval = 1000
else:
progress_interval = progress
# Defining the 'real' save method, utilizing the transaction
# decorator created during initialization.
#self.transaction_decorator
def _get_values(feat_range=default_range, num_feat=0, num_saved=0):
if feat_range:
layer_iter = self.layer[feat_range]
else:
layer_iter = self.layer
for feat in layer_iter:
num_feat += 1
# Getting the keyword arguments
try:
kwargs = self.feature_kwargs(feat)
except LayerMapError, msg:
# Something borked the validation
if strict: raise
elif not silent:
stream.write('Ignoring Feature ID %s because: %s\n' % (feat.fid, msg))
else:
# Constructing the model using the keyword args
is_update = False
if self.unique:
# If we want unique models on a particular field, handle the
# geometry appropriately.
try:
# Getting the keyword arguments and retrieving
# the unique model.
u_kwargs = self.unique_kwargs(kwargs)
m = self.model.objects.using(self.using).get(**u_kwargs)
is_update = True
# Getting the geometry (in OGR form), creating
# one from the kwargs WKT, adding in additional
# geometries, and update the attribute with the
# just-updated geometry WKT.
geom = getattr(m, self.geom_field).ogr
new = OGRGeometry(kwargs[self.geom_field])
for g in new: geom.add(g)
setattr(m, self.geom_field, geom.wkt)
except ObjectDoesNotExist:
# No unique model exists yet, create.
m = self.model(**kwargs)
else:
m = self.model(**kwargs)
try:
# Attempting to save.
pippo = kwargs
num_saved += 1
if verbose: stream.write('%s: %s\n' % (is_update and 'Updated' or 'Saved', m))
except SystemExit:
raise
except Exception, msg:
if self.transaction_mode == 'autocommit':
# Rolling back the transaction so that other model saves
# will work.
transaction.rollback_unless_managed()
if strict:
# Bailing out if the `strict` keyword is set.
if not silent:
stream.write('Failed to save the feature (id: %s) into the model with the keyword arguments:\n' % feat.fid)
stream.write('%s\n' % kwargs)
raise
elif not silent:
stream.write('Failed to save %s:\n %s\nContinuing\n' % (kwargs, msg))
# Printing progress information, if requested.
if progress and num_feat % progress_interval == 0:
stream.write('Processed %d features, saved %d ...\n' % (num_feat, num_saved))
# Only used for status output purposes -- incremental saving uses the
# values returned here.
return pippo
nfeat = self.layer.num_feat
if step and isinstance(step, int) and step < nfeat:
# Incremental saving is requested at the given interval (step)
if default_range:
raise LayerMapError('The `step` keyword may not be used in conjunction with the `fid_range` keyword.')
beg, num_feat, num_saved = (0, 0, 0)
indices = range(step, nfeat, step)
n_i = len(indices)
for i, end in enumerate(indices):
# Constructing the slice to use for this step; the last slice is
# special (e.g, [100:] instead of [90:100]).
if i + 1 == n_i: step_slice = slice(beg, None)
else: step_slice = slice(beg, end)
try:
pippo = _get_values(step_slice, num_feat, num_saved)
beg = end
except:
stream.write('%s\nFailed to save slice: %s\n' % ('=-' * 20, step_slice))
raise
else:
# Otherwise, just calling the previously defined _save() function.
return _get_values()
In a custom save or save_model method you can then use:
track_mapping = {'nome': 'name',
'track' : 'MULTILINESTRING'}
targetPath = "/my/gpx/file/path.gpx"
gpx_file = DataSource(targetPath)
mytrack = MyMapping(GPXTrack, gpx_file, track_mapping, layer='tracks')
pippo = mytrack.get_values()
obj.track = pippo['track']