I'm training a CNN with word embeddings, and for some reason I get a FailedPreconditionError whenever I try to save a frozen version of the model for later use.
This is despite the fact that I call sess.run(tf.global_variables_initializer()) just before training, and I have no problem training and checkpointing the model.
The problem occurs when I try to load the model from a checkpoint and save a frozen model. The function I'm using is as follows:
import tensorflow as tf
from tensorflow.python.framework import graph_util

def freeze_model(checkpoint_path, model_save_path, output_node_names):
    # Find the latest checkpoint and rebuild its graph from the .meta file.
    checkpoint = tf.train.get_checkpoint_state(checkpoint_path)
    input_checkpoint = checkpoint.model_checkpoint_path
    saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True)
    graph = tf.get_default_graph()
    input_graph_def = graph.as_graph_def()
    with tf.Session() as sess:
        # Restore the checkpointed weights, then bake them into constants.
        saver.restore(sess, input_checkpoint)
        output_graph_def = graph_util.convert_variables_to_constants(
            sess,
            input_graph_def,
            output_node_names
        )
    with tf.gfile.GFile(model_save_path, "wb") as f:
        f.write(output_graph_def.SerializeToString())
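For reference, I call it roughly like this (the paths and output node names here are just placeholders):
freeze_model("runs/checkpoints", "runs/frozen_model.pb", ["output/predictions"])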
The error I get is:
Traceback (most recent call last):
  File "myproject/train.py", line 522, in <module>
    tf.app.run()
  File "/home/foo/anaconda2/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 44, in run
    _sys.exit(main(_sys.argv[:1] + flags_passthrough))
  File "myproject/train.py", line 518, in main
    trainer.save_model(preprocessor)
  File "myproject/train.py", line 312, in save_model
    ut.freeze_model(self.checkpoint_dir, model_save_path, C.OUTPUT_NODE_NAMES)
  File "/home/foo/anaconda2/lib/python2.7/site-packages/myproject/utils.py", line 224, in freeze_model
    output_node_names
  File "/home/foo/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/graph_util_impl.py", line 218, in convert_variables_to_constants
    returned_variables = sess.run(variable_names)
  File "/home/foo/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 767, in run
    run_metadata_ptr)
  File "/home/foo/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 965, in _run
    feed_dict_string, options, run_metadata)
  File "/home/foo/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1015, in _do_run
    target_list, options, run_metadata)
  File "/home/foo/anaconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1035, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.FailedPreconditionError: Attempting to use uninitialized value embeddings/W
  [[Node: embeddings/W/_20 = _Send[T=DT_FLOAT, client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_30_embeddings/W", _device="/job:localhost/replica:0/task:0/gpu:0"](embeddings/W)]]
  [[Node: conv_maxpool_4/W/_17 = _Recv[_start_time=0, client_terminated=false, recv_device="/job:localhost/replica:0/task:0/cpu:0", send_device="/job:localhost/replica:0/task:0/gpu:0", send_device_incarnation=1, tensor_name="edge_26_conv_maxpool_4/W", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
It turns out I was constructing the Saver object before I created the Session, so nothing from the session was being saved.
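For anyone hitting the same error, here is a minimal sketch of an ordering that avoids it (the variable names and shapes are made up): a Saver only captures the variables that exist when it is constructed, so build the graph first.
import tensorflow as tf

# Build the full graph first; the Saver snapshots the variables
# that exist at the moment it is constructed.
with tf.variable_scope("embeddings"):
    W = tf.get_variable("W", shape=[10000, 128])
# ... rest of the model ...

saver = tf.train.Saver()  # constructed only after the graph is complete

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # ... training steps ...
    saver.save(sess, "checkpoints/model")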
I have a small task that reads an SVG file from a path and uses cairosvg's svg2pdf to convert the SVG to PDF. If I run the function without Celery's delay, it runs fine and converts the file to PDF. If I run the function as a Celery task, I get these errors:
Traceback (most recent call last):
  File "/..../tasks.py", line 24, in create_download_pdf
    svg2pdf(bytestring=bytestring, write_to=completeName)
  File "/.../lib/python3.10/site-packages/cairosvg/__init__.py", line 67, in svg2pdf
    return surface.PDFSurface.convert(
  File "/.../lib/python3.10/site-packages/cairosvg/surface.py", line 131, in convert
    instance = cls(
  File "/.../lib/python3.10/site-packages/cairosvg/surface.py", line 202, in __init__
    self.cairo, self.width, self.height = self._create_surface(
  File "/.../lib/python3.10/site-packages/cairosvg/surface.py", line 242, in _create_surface
    cairo_surface = self.surface_class(self.output, width, height)
  File "/.../lib/python3.10/site-packages/cairocffi/surfaces.py", line 876, in __init__
    Surface.__init__(self, pointer, target_keep_alive=write_func)
  File "/.../lib/python3.10/site-packages/cairocffi/surfaces.py", line 158, in __init__
    self._check_status()
  File "/../lib/python3.10/site-packages/cairocffi/surfaces.py", line 170, in _check_status
    _check_status(cairo.cairo_surface_status(self._pointer))
  File "/../lib/python3.10/site-packages/cairocffi/__init__.py", line 88, in _check_status
    raise exception(message, status)
OSError: [Errno cairo returned CAIRO_STATUS_WRITE_ERROR: b'error while writing to output stream'] 11
Here is the function:
import logging
import os
import traceback

from cairosvg import svg2pdf

logger = logging.getLogger(__name__)

@app.task(name="create_download_pdf")  # `app` is the project's Celery instance
def create_download_pdf(folder_path: str, svg_data=None, filename=None, file_path=None) -> None:
    try:
        completeName = os.path.join(folder_path, f"{filename}.pdf")
        if svg_data:
            svg2pdf(bytestring=svg_data, write_to=completeName)
        elif file_path:
            with open(file_path, 'r') as f:
                bytestring = f.read()
            print(bytestring)
            svg2pdf(bytestring=bytestring, write_to=completeName)
    except (OSError, ValueError):
        logger.exception(
            f"PDF creation and download exception: unable to download/create "
            f"PDF from SVG data for {str(filename) or ''}.\n"
            f"{traceback.format_exc()}"
        )
How can this be solved? I am not sending a file object to the Celery task, so it's not that problem.
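Since CAIRO_STATUS_WRITE_ERROR just means cairo could not write to its output stream, one thing worth ruling out (an assumption, not a confirmed diagnosis) is that folder_path does not exist, or is not writable, from the worker process, which may run with a different user or working directory than the code that works without delay. A minimal pre-flight check might look like:
import os

def ensure_writable_dir(folder_path: str) -> None:
    # Hypothetical helper: create the target folder if the worker
    # doesn't see it, and fail loudly if it cannot write there.
    os.makedirs(folder_path, exist_ok=True)
    if not os.access(folder_path, os.W_OK):
        raise OSError(f"{folder_path} is not writable from the worker")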
I used Vertex AI Pipelines to custom-train a model on tabular data.
I ran the Python code below, then created a run of the pipeline (CREATE RUN) with the generated JSON.
The following error occurred at the start of training.
Why was the tabular dataset treated as an image dataset? What is wrong?
Environment
Python 3.7.3
kfp==1.6.2
kfp-pipeline-spec==0.1.7
kfp-server-api==1.6.0
Error message
Traceback (most recent call last):
  File "/opt/python3.7/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/opt/python3.7/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/opt/python3.7/lib/python3.7/site-packages/google_cloud_pipeline_components/aiplatform/remote_runner.py", line 284, in <module>
    main()
  File "/opt/python3.7/lib/python3.7/site-packages/google_cloud_pipeline_components/aiplatform/remote_runner.py", line 280, in main
    print(runner(args.cls_name, args.method_name, executor_input, kwargs))
  File "/opt/python3.7/lib/python3.7/site-packages/google_cloud_pipeline_components/aiplatform/remote_runner.py", line 236, in runner
    prepare_parameters(serialized_args[METHOD_KEY], method, is_init=False)
  File "/opt/python3.7/lib/python3.7/site-packages/google_cloud_pipeline_components/aiplatform/remote_runner.py", line 205, in prepare_parameters
    value = cast(value, param_type)
  File "/opt/python3.7/lib/python3.7/site-packages/google_cloud_pipeline_components/aiplatform/remote_runner.py", line 176, in cast
    return annotation_type(value)
  File "/opt/python3.7/lib/python3.7/site-packages/google/cloud/aiplatform/datasets/dataset.py", line 82, in __init__
    self._validate_metadata_schema_uri()
  File "/opt/python3.7/lib/python3.7/site-packages/google/cloud/aiplatform/datasets/dataset.py", line 100, in _validate_metadata_schema_uri
    f"{self.__class__.__name__} class can not be used to retrieve "
ValueError: ImageDataset class can not be used to retrieve dataset resource projects/nnnnnnnnnnnn/locations/us-central1/datasets/3781554739456507904, check the dataset type
Python code:
import datetime
from kfp.v2 import dsl, compiler
from kfp.v2.google.client import AIPlatformClient
import google_cloud_pipeline_components.aiplatform as gcc_ai

PROJECT = "my-project"
PIPELINE_NAME = "test-pipeline"
PIPELINE_ROOT_PATH = f"gs://test-pipeline-20210525/{PIPELINE_NAME}"


@dsl.pipeline(
    name=PIPELINE_NAME,
    pipeline_root=PIPELINE_ROOT_PATH
)
def test_pipeline(
    display_name: str = f"{PIPELINE_NAME}-2021MMDD-nn"
):
    dataset_create_op = gcc_ai.TabularDatasetCreateOp(
        project=PROJECT, display_name=display_name,
        gcs_source="gs://used_apartment/datasource/train.csv"
    )
    training_job_run_op = gcc_ai.CustomContainerTrainingJobRunOp(
        project=PROJECT, display_name=display_name,
        container_uri="us-central1-docker.pkg.dev/my-project/dataops-rc2021/custom-train:latest",
        staging_bucket="vertex_ai_staging_rc2021",
        base_output_dir="gs://used_apartment/cstm_img_scrf/artifact",
        model_serving_container_image_uri="us-central1-docker.pkg.dev/my-project/dataops-rc2021/custom-pred:latest",
        model_serving_container_predict_route="/",
        model_serving_container_health_route="/health",
        model_serving_container_ports=[8080],
        training_fraction_split=0.8,
        validation_fraction_split=0.1,
        test_fraction_split=0.1,
        dataset=dataset_create_op.outputs["dataset"]
    )


def run_pipeline(event=None, context=None):
    # Compile the pipeline using the kfp.v2.compiler.Compiler
    compiler.Compiler().compile(
        pipeline_func=test_pipeline,
        package_path="test-pipeline.json"
    )


if __name__ == '__main__':
    run_pipeline()
This seems to be a bug in the CustomContainerTrainingJobRunOp component code; we were able to reproduce the error.
I have created a tracking bug: https://github.com/kubeflow/pipelines/issues/5885.
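Until that is fixed, one possible workaround is to run the job directly with the google-cloud-aiplatform SDK that the component wraps, outside the pipeline. This is only a sketch under that assumption, reusing the question's values:
from google.cloud import aiplatform

# Sketch: run the same custom container training job via the SDK,
# bypassing the component's parameter casting.
aiplatform.init(project="my-project", location="us-central1",
                staging_bucket="gs://vertex_ai_staging_rc2021")

dataset = aiplatform.TabularDataset.create(
    display_name="test-pipeline-dataset",
    gcs_source="gs://used_apartment/datasource/train.csv",
)

job = aiplatform.CustomContainerTrainingJob(
    display_name="test-pipeline-2021MMDD-nn",
    container_uri="us-central1-docker.pkg.dev/my-project/dataops-rc2021/custom-train:latest",
    model_serving_container_image_uri="us-central1-docker.pkg.dev/my-project/dataops-rc2021/custom-pred:latest",
    model_serving_container_predict_route="/",
    model_serving_container_health_route="/health",
)

model = job.run(
    dataset=dataset,
    base_output_dir="gs://used_apartment/cstm_img_scrf/artifact",
    training_fraction_split=0.8,
    validation_fraction_split=0.1,
    test_fraction_split=0.1,
)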
I wanted to build a classification model using Google ML Engine. I took this documentation as a reference and successfully followed this post up to the preprocessing step.
After executing the training command, I got the following error message:
The replica master 0 exited with a non-zero status of 1. Traceback (most recent call last): [...]
  File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 436, in run
    dispatch(args, model, cluster, task)
  File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 477, in dispatch
    Trainer(args, model, cluster, task).run_training()
  File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 241, in run_training
    self.eval(session)
  File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 283, in eval
    self.model.format_metric_values(self.evaluator.evaluate())
  File "/root/.local/lib/python2.7/site-packages/trainer/task.py", line 57, in evaluate
    self.eval_batch_size)
  File "/root/.local/lib/python2.7/site-packages/trainer/model.py", line 300, in build_eval_graph
    return self.build_graph(data_paths, batch_size, GraphMod.EVALUATE)
  File "/root/.local/lib/python2.7/site-packages/trainer/model.py", line 224, in build_graph
    tensors.examples = tf.placeholder(tf.string, name='input', shape=(None,))
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/array_ops.py", line 1502, in placeholder
    name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 2149, in _placeholder
    name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2327, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1226, in __init__
    self._traceback = _extract_stack()
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'input' with dtype string
  [[Node: input = Placeholder[dtype=DT_STRING, shape=[], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]
Kindly help me out with this.
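For context (an illustration of the error, not the trainer's code): a tf.placeholder has no value of its own, so every sess.run() that touches it must supply one via feed_dict. The error above means the 'input' placeholder was evaluated without being fed.
import tensorflow as tf

# Minimal illustration of the failure mode:
x = tf.placeholder(tf.string, name='input', shape=(None,))
y = tf.identity(x)

with tf.Session() as sess:
    # sess.run(y)  # would raise InvalidArgumentError: 'input' was never fed
    print(sess.run(y, feed_dict={x: [b"example"]}))  # works: value supplied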
I have the following code in Python:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(train_data_features, train["sentiment"])
but I get a KeyError for "sentiment", and I don't know why. The data is loaded with:
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.py", line 1780, in __getitem__
    return self._getitem_column(key)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.py", line 1787, in _getitem_column
    return self._get_item_cache(key)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/generic.py", line 1068, in _get_item_cache
    values = self._data.get(item)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/internals.py", line 2849, in get
    loc = self.items.get_loc(item)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/index.py", line 1402, in get_loc
    return self._engine.get_loc(_values_from_object(key))
  File "pandas/index.pyx", line 134, in pandas.index.IndexEngine.get_loc (pandas/index.c:3807)
  File "pandas/index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas/index.c:3687)
  File "pandas/hashtable.pyx", line 696, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12310)
  File "pandas/hashtable.pyx", line 704, in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12261)
KeyError: 'sentiment'
Are you doing the Kaggle competition? https://www.kaggle.com/c/word2vec-nlp-tutorial/data
Are you sure you have downloaded and decompressed the file correctly? The first part of the file reads:
id sentiment review
"5814_8" 1 "With all this stuff go
This works for me:
>>> train = pd.read_csv("labeledTrainData.tsv", delimiter="\t")
>>> train.columns
Index([u'id', u'sentiment', u'review'], dtype='object')
>>> train.head(3)
id sentiment review
0 5814_8 1 With all this stuff going down at the moment w...
1 2381_9 1 \The Classic War of the Worlds\" by Timothy Hi...
2 7759_3 0 The film starts with a manager (Nicholas Bell)...
Check that the columns are set up correctly in the train variable: you should have a sentiment column, and that column seems to be missing from your DataFrame.
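If the file on disk is not what you expect, a quick way to see what pandas actually parsed (just a sketch using your own read call) is:
import pandas as pd

train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
# If 'sentiment' is missing here, the file is not the one you think it is.
print(train.columns.tolist())   # expected: ['id', 'sentiment', 'review']
print(train.head(3))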
The whole error is:
Traceback (most recent call last):
  File "C:/Documents and Settings/aaa/trysphinx/views.py", line 30, in <module>
    print list(results)
  File "C:\Python27\lib\site-packages\django_sphinx-2.2.4-py2.7.egg\djangosphinx\models.py", line 243, in __iter__
    return iter(self._get_data())
  File "C:\Python27\lib\site-packages\django_sphinx-2.2.4-py2.7.egg\djangosphinx\models.py", line 422, in _get_data
    self._result_cache = list(self._get_results())
  File "C:\Python27\lib\site-packages\django_sphinx-2.2.4-py2.7.egg\djangosphinx\models.py", line 557, in _get_results
    results = self._get_sphinx_results()
  File "C:\Python27\lib\site-packages\django_sphinx-2.2.4-py2.7.egg\djangosphinx\models.py", line 529, in _get_sphinx_results
    results = client.Query(self._query, self._index)
  File "C:\Python27\lib\site-packages\django_sphinx-2.2.4-py2.7.egg\djangosphinx\apis\api263\__init__.py", line 388, in Query
    response = self._GetResponse(sock, VER_COMMAND_SEARCH)
  File "C:\Python27\lib\site-packages\django_sphinx-2.2.4-py2.7.egg\djangosphinx\apis\api263\__init__.py", line 144, in _GetResponse
    chunk = sock.recv(left)
MemoryError
Please help me.
You're loading more data into memory than your computer can handle. Perhaps rather than:
print list(results)
…do:
for r in results:
    print r
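If you need to handle results in batches rather than one at a time, a bounded-chunk pattern (plain Python 2, assuming results is lazily iterable) keeps memory flat:
from itertools import islice

it = iter(results)
while True:
    # Pull at most 1000 rows into memory at a time instead of
    # materializing the whole result set with list(results).
    chunk = list(islice(it, 1000))
    if not chunk:
        break
    for r in chunk:
        print r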