Related
I'm trying to use Cython (version 0.29.30) to speed up my computations on a Mac M1. I am testing the .pyx file with pyximport, but I got:
/Users/xxxx/.pyxbld/temp.macosx-10.9-universal2-3.10/pyrex/binomial.c:697:10: fatal error: 'ios' file not found
#include "ios"
^~~~~
1 error generated.
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/unixccompiler.py", line 117, in _compile
self.spawn(compiler_so + cc_args + [src, '-o', obj] +
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/ccompiler.py", line 910, in spawn
spawn(cmd, dry_run=self.dry_run)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/spawn.py", line 91, in spawn
raise DistutilsExecError(
distutils.errors.DistutilsExecError: command '/usr/bin/clang' failed with exit code 1
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pyximport/pyximport.py", line 214, in load_module
so_path = build_module(module_name, pyxfilename, pyxbuild_dir,
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pyximport/pyximport.py", line 186, in build_module
so_path = pyxbuild.pyx_to_dll(pyxfilename, extension_mod,
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pyximport/pyxbuild.py", line 102, in pyx_to_dll
dist.run_commands()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/dist.py", line 966, in run_commands
self.run_command(cmd)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/dist.py", line 985, in run_command
cmd_obj.run()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/Cython/Distutils/old_build_ext.py", line 186, in run
_build_ext.build_ext.run(self)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/command/build_ext.py", line 340, in run
self.build_extensions()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/Cython/Distutils/old_build_ext.py", line 195, in build_extensions
_build_ext.build_ext.build_extensions(self)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/command/build_ext.py", line 449, in build_extensions
self._build_extensions_serial()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/command/build_ext.py", line 474, in _build_extensions_serial
self.build_extension(ext)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/command/build_ext.py", line 529, in build_extension
objects = self.compiler.compile(sources,
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/ccompiler.py", line 574, in compile
self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/unixccompiler.py", line 120, in _compile
raise CompileError(msg)
distutils.errors.CompileError: command '/usr/bin/clang' failed with exit code 1
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pyximport/pyximport.py", line 459, in load_module
module = load_module(fullname, self.path,
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pyximport/pyximport.py", line 231, in load_module
raise exc.with_traceback(tb)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pyximport/pyximport.py", line 214, in load_module
so_path = build_module(module_name, pyxfilename, pyxbuild_dir,
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pyximport/pyximport.py", line 186, in build_module
so_path = pyxbuild.pyx_to_dll(pyxfilename, extension_mod,
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pyximport/pyxbuild.py", line 102, in pyx_to_dll
dist.run_commands()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/dist.py", line 966, in run_commands
self.run_command(cmd)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/dist.py", line 985, in run_command
cmd_obj.run()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/Cython/Distutils/old_build_ext.py", line 186, in run
_build_ext.build_ext.run(self)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/command/build_ext.py", line 340, in run
self.build_extensions()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/Cython/Distutils/old_build_ext.py", line 195, in build_extensions
_build_ext.build_ext.build_extensions(self)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/command/build_ext.py", line 449, in build_extensions
self._build_extensions_serial()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/command/build_ext.py", line 474, in _build_extensions_serial
self.build_extension(ext)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/command/build_ext.py", line 529, in build_extension
objects = self.compiler.compile(sources,
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/ccompiler.py", line 574, in compile
self._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/distutils/unixccompiler.py", line 120, in _compile
raise CompileError(msg)
ImportError: Building module binomial failed: ["distutils.errors.CompileError: command '/usr/bin/clang' failed with exit code 1\n"]
I'm quite new to Cython. When I ran cython xxx.pyx directly, it worked. The answers to this question are a great help, but I was confused and looked for a better solution. I tried the first answer, which uses the C++ compiler:
>>> script_args = ["--cython-cplus"]
>>> setup_args = {"script_args": script_args}
>>> pyximport.install(setup_args=setup_args, language_level=3)
but got:
distutils.errors.DistutilsExecError: command '/usr/bin/clang++' failed with exit code 1
So I tried the second answer, and it works fine. However, it means that I have to create a .pyxbld file for each .pyx file, which is quite annoying.
How can I solve this?
Information about the Dataflow pipeline we are referring to in this incident:
the pipeline is responsible for moving data from an Oracle source to BigQuery;
the pipeline is written in Python 3.6;
it uses ojdbc, jdk and jaydebeapi;
our code ensures that all required libraries etc. are always installed on all the Dataflow workers before execution.
Problem description:
On 21/10 we experienced a problem with a Dataflow worker (in the europe-west3 region) - see the log below. It seems it couldn't load or use the jaydebeapi library.
2020-10-21 17:28:42.792 CESTError message from worker: Traceback (most recent call last): File "apache_beam/runners/common.py", line 997, in apache_beam.runners.common.DoFnRunner._invoke_bundle_method File "apache_beam/runners/common.py", line 490, in apache_beam.runners.common.DoFnInvoker.invoke_start_bundle File "apache_beam/runners/common.py", line 496, in apache_beam.runners.common.DoFnInvoker.invoke_start_bundle File "/usr/local/lib/python3.7/site-packages/libs/dataflow/common.py", line 269, in start_bundle jars=[f"/tmp/{self.ojdbc_lib}"] File "/usr/local/lib/python3.7/site-packages/jaydebeapi/init.py", line 412, in connect jconn = _jdbc_connect(jclassname, url, driver_args, jars, libs) File "/usr/local/lib/python3.7/site-packages/jaydebeapi/init.py", line 199, in _jdbc_connect_jpype convertStrings=True) File "/usr/local/lib/python3.7/site-packages/jpype/_core.py", line 216, in startJVM ignoreUnrecognized, convertStrings, interrupt) SystemError: java.lang.ClassNotFoundException: org.jpype.classloader.DynamicClassLoader During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/usr/local/lib/python3.7/site-packages/dataflow_worker/batchworker.py", line 638, in do_work work_executor.execute() File "/usr/local/lib/python3.7/site-packages/dataflow_worker/executor.py", line 179, in execute op.start() File "apache_beam/runners/worker/operations.py", line 662, in apache_beam.runners.worker.operations.DoOperation.start File "apache_beam/runners/worker/operations.py", line 664, in apache_beam.runners.worker.operations.DoOperation.start File "apache_beam/runners/worker/operations.py", line 666, in apache_beam.runners.worker.operations.DoOperation.start File "apache_beam/runners/common.py", line 1014, in apache_beam.runners.common.DoFnRunner.start File "apache_beam/runners/common.py", line 999, in apache_beam.runners.common.DoFnRunner._invoke_bundle_method File "apache_beam/runners/common.py", line 1045, in 
apache_beam.runners.common.DoFnRunner._reraise_augmented File "/usr/local/lib/python3.7/site-packages/future/utils/init.py", line 446, in raise_with_traceback raise exc.with_traceback(traceback) File "apache_beam/runners/common.py", line 997, in apache_beam.runners.common.DoFnRunner._invoke_bundle_method File "apache_beam/runners/common.py", line 490, in apache_beam.runners.common.DoFnInvoker.invoke_start_bundle File "apache_beam/runners/common.py", line 496, in apache_beam.runners.common.DoFnInvoker.invoke_start_bundle File "/usr/local/lib/python3.7/site-packages/libs/dataflow/common.py", line 269, in start_bundle jars=[f"/tmp/{self.ojdbc_lib}"] File "/usr/local/lib/python3.7/site-packages/jaydebeapi/init.py", line 412, in connect jconn = _jdbc_connect(jclassname, url, driver_args, jars, libs) File "/usr/local/lib/python3.7/site-packages/jaydebeapi/init.py", line 199, in _jdbc_connect_jpype convertStrings=True) File "/usr/local/lib/python3.7/site-packages/jpype/_core.py", line 216, in startJVM ignoreUnrecognized, convertStrings, interrupt) SystemError: java.lang.ClassNotFoundException: org.jpype.classloader.DynamicClassLoader [while running 'Read from Oracle source/Read from database']
The problem occurred several times when re-running exactly the same code, and then it disappeared and everything worked well with the same code. Do you have any idea what could have happened? It seems to us that it was something to do with infrastructure/worker provisioning etc.
I am successfully starting training jobs in Google Cloud. However, after running for 30 min to 1 hour and several thousand steps they end with an uninformative error message: "CancelledError: Cancelled".
I am training on ~30K images spread over 16 tfrecord files. I do not have this issue when training on a smaller number of images in a single file (~5K or so).
Here are the details:
I start the job using this command:
gcloud ai-platform jobs submit training my_job_name \
--runtime-version 1.13 \
--job-dir=gs://image-training/my_job_dir \
--packages dist/object_detection-0.1.tar.gz,slim/dist/slim-0.1.tar.gz,dist/pycocotools-2.0.tar.gz \
--module-name object_detection.model_main \
--region us-east1 --config object_detection/CLOUDgpu.yaml \
--python-version 3.5 \
-- \
--model_dir gs://image-training/my_job_dir \
--pipeline_config_path=gs://image-training/ssd_inception_v2_coco_2018_01_28/ssd_inception_v2_CLOUD.config
Here is my YAML file:
trainingInput:
runtimeVersion: "1.13"
scaleTier: CUSTOM
masterType: standard_gpu
workerCount: 9
workerType: standard_gpu
parameterServerCount: 3
parameterServerType: standard
My config file references the data files like this:
train_input_reader: {
tf_record_input_reader {
input_path: "gs://image-training/t0423data/train_*_re.tfrecord"
}
num_readers:3
label_map_path: "gs://image-training/PigCount/label_map.pbtxt"
}
Finally, the full error:
The replica worker 6 exited with a non-zero status of 1. Termination reason:
Error. Traceback (most recent call last): [...] saving_listeners) File
"/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py",
line 1407, in _train_with_estimator_spec _, loss =
mon_sess.run([estimator_spec.train_op, estimator_spec.loss]) File
"/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py",
line 676, in run run_metadata=run_metadata) File
"/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py",
line 1171, in run run_metadata=run_metadata) File
"/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py",
line 1270, in run raise six.reraise(*original_exc_info) File
"/usr/local/lib/python3.5/dist-packages/six.py", line 693, in reraise raise
value File
"/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py",
line 1255, in run return self._sess.run(*args, **kwargs) File
"/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py",
line 1327, in run run_metadata=run_metadata) File
"/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py",
line 1091, in run return self._sess.run(*args, **kwargs) File
"/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py",
line 929, in run run_metadata_ptr) File
"/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py",
line 1152, in _run feed_dict_tensor, options, run_metadata) File
"/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py",
line 1328, in _do_run run_metadata) File
"/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py",
line 1348, in _do_call raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.CancelledError: Cancelled To find out
more about why your job exited please check the logs:
https://console.cloud.google.com/logs/viewer?project=226138759195&resource=ml_job%2Fjob_id%2Ft_05_01_big_data1&advancedFilter=resource.type%3D%22ml_job%22%0Aresource.labels.job_id%3D%22t_05_01_big_data1%22
In the logs Replica 6 shows these errors:
command '['python3', '-m', 'object_detection.model_main', '--model_dir', 'gs://image-training/my_job_dir', '--pipeline_config_path=gs://image-training/ssd_inception_v2_coco_2018_01_28/ssd_inception_v2_CLOUD.config', '--job-dir', 'gs://image-training/my_job_dir']' returned non-zero exit status 1
And just before that:
worker-replica-6
Traceback (most recent call last): File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1334, in _do_call return fn(*args) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1319, in _run_fn options, feed_dict, fetch_list, target_list, run_metadata) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1407, in _call_tf_sessionrun run_metadata) tensorflow.python.framework.errors_impl.CancelledError: Cancelled During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main "__main__", mod_spec) File "/usr/lib/python3.5/runpy.py", line 85, in _run_code exec(code, run_globals) File "/root/.local/lib/python3.5/site-packages/object_detection/model_main.py", line 109, in <module> tf.app.run() File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/platform/app.py", line 125, in run _sys.exit(main(argv)) File "/root/.local/lib/python3.5/site-packages/object_detection/model_main.py", line 105, in main tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0]) File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/training.py", line 471, in train_and_evaluate return executor.run() File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/training.py", line 638, in run getattr(self, task_to_run)() File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/training.py", line 648, in run_worker return self._start_distributed_training() File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/training.py", line 789, in _start_distributed_training saving_listeners=saving_listeners) File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 358, in train loss = self._train_model(input_fn, hooks, 
saving_listeners) File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1124, in _train_model return self._train_model_default(input_fn, hooks, saving_listeners) File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1158, in _train_model_default saving_listeners) File "/usr/local/lib/python3.5/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1407, in _train_with_estimator_spec _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss]) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py", line 676, in run run_metadata=run_metadata) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py", line 1171, in run run_metadata=run_metadata) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py", line 1270, in run raise six.reraise(*original_exc_info) File "/usr/local/lib/python3.5/dist-packages/six.py", line 693, in reraise raise value File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py", line 1255, in run return self._sess.run(*args, **kwargs) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py", line 1327, in run run_metadata=run_metadata) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/monitored_session.py", line 1091, in run return self._sess.run(*args, **kwargs) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 929, in run run_metadata_ptr) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1152, in _run feed_dict_tensor, options, run_metadata) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1328, in _do_run run_metadata) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", 
line 1348, in _do_call raise type(e)(node_def, op, message) tensorflow.python.framework.errors_impl.CancelledError: Cancelled
Any idea how I can keep these jobs from failing?
I seem to have solved this issue by increasing the number and power of the machines that I'm using. I changed the YAML file to the following, and it has run for 50,000 steps with no problems. It is way more expensive, but at least it works:
trainingInput:
scaleTier: CUSTOM
# Configure a master worker with 4 K80 GPUs
masterType: n1-highcpu-16
masterConfig:
acceleratorConfig:
count: 4
type: NVIDIA_TESLA_K80
# Configure 9 workers, each with 4 K80 GPUs
workerCount: 9
workerType: n1-highcpu-16
workerConfig:
acceleratorConfig:
count: 4
type: NVIDIA_TESLA_K80
# Configure 3 parameter servers with no GPUs
parameterServerCount: 3
parameterServerType: n1-highmem-8
See this page for a full explanation: https://cloud.google.com/ml-engine/docs/tensorflow/using-gpus
I don't know if this is relevant, but the supercomputer operators conducted an upgrade of the Booster module yesterday. Since then, my TensorFlow scripts, which were working perfectly fine before the upgrade, raise the following error:
2018-06-30 02:21:11.787262: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
Traceback (most recent call last):
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/bin/deep-ScaffOpt.py", line 524, in <module>
serial_RF=False))
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/bin/deep-ScaffOpt.py", line 189, in train_MLP
MLP = deepMetaPredictor().combinePredictors_datatypes(datasets, mat, serial_RF=serial_RF, META_ZCUTOFF=datasets.args.META_ZCUTOFF)
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/lib/deepMetaPredictor.py", line 169, in combinePredictors_datatypes
mlp.fit(datasets.x_crossval['lhl'], datasets.y_crossval['lhl'])
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/lib/ANN_functions.py", line 324, in fit
_, c, p = self.sess.run([self.optimizer, self.cost, self.pred], feed_dict={self.x: batch_x, self.y: batch_y})
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 900, in run
run_metadata_ptr)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1135, in _run
feed_dict_tensor, options, run_metadata)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1316, in _do_run
run_metadata)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1335, in _do_call
raise type(e)(node_def, op, message)
InvalidArgumentError: Expected size[0] in [0, 150], but got 300
[[Node: Slice = Slice[Index=DT_INT32, T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_Placeholder_1_0_1, Slice/begin, gradients/sub_grad/Shape_1)]]
Caused by op u'Slice', defined at:
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/scoop/_control.py", line 127, in runFuture
future.resultValue = future.callable(*future.args, **future.kargs)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/runpy.py", line 252, in run_path
return _run_module_code(code, init_globals, run_name, path_name)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/runpy.py", line 82, in _run_module_code
mod_name, mod_fname, mod_loader, pkg_name)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/bin/deep-ScaffOpt.py", line 524, in <module>
serial_RF=False))
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/bin/deep-ScaffOpt.py", line 189, in train_MLP
MLP = deepMetaPredictor().combinePredictors_datatypes(datasets, mat, serial_RF=serial_RF, META_ZCUTOFF=datasets.args.META_ZCUTOFF)
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/lib/deepMetaPredictor.py", line 167, in combinePredictors_datatypes
random_state=datasets.random_state)
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/lib/ANN_functions.py", line 235, in __init__
self.cost = tf_group_RMSE(self.y, self.pred, matrices.assaysize_vec, matrices.group_matrix) \
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/lib/ConsScoreTK_Statistics.py", line 1261, in tf_group_RMSE
Y = tf.slice(Y, [0], [b_molnum])
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.py", line 650, in slice
return gen_array_ops._slice(input_, begin, size, name=name)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 7093, in _slice
"Slice", input=input, begin=begin, size=size, name=name)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
op_def=op_def)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InvalidArgumentError (see above for traceback): Expected size[0] in [0, 150], but got 300
[[Node: Slice = Slice[Index=DT_INT32, T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_Placeholder_1_0_1, Slice/begin, gradients/sub_grad/Shape_1)]]
Traceback (most recent call last):
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/runpy.py", line 174, in _run_module_as_main
"__main__", fname, loader, pkg_name)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/scoop/bootstrap/__main__.py", line 302, in <module>
b.main()
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/scoop/bootstrap/__main__.py", line 92, in main
self.run()
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/scoop/bootstrap/__main__.py", line 290, in run
futures_startup()
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/scoop/bootstrap/__main__.py", line 271, in futures_startup
run_name="__main__"
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/scoop/futures.py", line 64, in _startup
result = _controller.switch(rootFuture, *args, **kargs)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/scoop/_control.py", line 253, in runController
raise future.exceptionValue
tensorflow.python.framework.errors_impl.InvalidArgumentError: Expected size[0] in [0, 150], but got 300
[[Node: Slice = Slice[Index=DT_INT32, T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_Placeholder_1_0_1, Slice/begin, gradients/sub_grad/Shape_1)]]
Caused by op u'Slice', defined at:
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/scoop/_control.py", line 127, in runFuture
future.resultValue = future.callable(*future.args, **future.kargs)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/runpy.py", line 252, in run_path
return _run_module_code(code, init_globals, run_name, path_name)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/runpy.py", line 82, in _run_module_code
mod_name, mod_fname, mod_loader, pkg_name)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/runpy.py", line 72, in _run_code
exec code in run_globals
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/bin/deep-ScaffOpt.py", line 524, in <module>
serial_RF=False))
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/bin/deep-ScaffOpt.py", line 189, in train_MLP
MLP = deepMetaPredictor().combinePredictors_datatypes(datasets, mat, serial_RF=serial_RF, META_ZCUTOFF=datasets.args.META_ZCUTOFF)
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/lib/deepMetaPredictor.py", line 167, in combinePredictors_datatypes
random_state=datasets.random_state)
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/lib/ANN_functions.py", line 235, in __init__
self.cost = tf_group_RMSE(self.y, self.pred, matrices.assaysize_vec, matrices.group_matrix) \
File "/homeb/eusmi01/eusmi0100/Programs/consscortk/lib/ConsScoreTK_Statistics.py", line 1261, in tf_group_RMSE
Y = tf.slice(Y, [0], [b_molnum])
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.py", line 650, in slice
return gen_array_ops._slice(input_, begin, size, name=name)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 7093, in _slice
"Slice", input=input, begin=begin, size=size, name=name)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
op_def=op_def)
File "/homeb/eusmi01/eusmi0100/Programs/Miniconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InvalidArgumentError (see above for traceback): Expected size[0] in [0, 150], but got 300
[[Node: Slice = Slice[Index=DT_INT32, T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_Placeholder_1_0_1, Slice/begin, gradients/sub_grad/Shape_1)]]
The version of TensorFlow that I use is 1.8.0. For the record, the same code works perfectly fine on my laptop, where I have version 1.4.0-dev installed. Could anyone enlighten me about the source of the error?
I am using the flowers tutorial code from cloudml-samples to implement multi-label classification on a set of restaurant photos.
I have updated dict.txt and the input files accordingly; here are some sample lines.
dict.txt
good_for_lunch
good_for_dinner
takes_reservations
outdoor_seating
restaurant_is_expensive
has_alcohol
has_table_service
ambience_is_classy
good_for_kids
eval_set.csv
...
gs://yelp_restaurant_photo_classification/train_photos/312753.jpg,good_for_dinner,takes_reservations,has_alcohol,has_table_service,good_for_kids
gs://yelp_restaurant_photo_classification/train_photos/342651.jpg,good_for_lunch,good_for_dinner,outdoor_seating,good_for_kids
gs://yelp_restaurant_photo_classification/train_photos/217079.jpg,takes_reservations,has_table_service
...
The preprocess job started running fine, but then this specific error kept coming up until the job failed.
python trainer/preprocess.py \
--input_dict "$DICT_FILE" \
--input_path "gs://yelp_restaurant_photo_classification/labels/eval_set.csv" \
--output_path "${GCS_PATH}/preproc/eval" \
--cloud
Job Logs - KeyError: u"FALSE [while running 'Extract label ids']"
(d8285fa55cb6ab07): Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 514, in do_work
work_executor.execute()
File "dataflow_worker/executor.py", line 894, in dataflow_worker.executor.MapTaskExecutor.execute (dataflow_worker/executor.c:24204)
op.start()
File "dataflow_worker/executor.py", line 197, in dataflow_worker.executor.ReadOperation.start (dataflow_worker/executor.c:7039)
def start(self):
File "dataflow_worker/executor.py", line 202, in dataflow_worker.executor.ReadOperation.start (dataflow_worker/executor.c:6946)
with self.spec.source.reader() as reader:
File "dataflow_worker/executor.py", line 212, in dataflow_worker.executor.ReadOperation.start (dataflow_worker/executor.c:6891)
self.output(windowed_value)
File "dataflow_worker/executor.py", line 142, in dataflow_worker.executor.Operation.output (dataflow_worker/executor.c:5249)
cython.cast(Receiver, self.receivers[output_index]).receive(windowed_value)
File "dataflow_worker/executor.py", line 89, in dataflow_worker.executor.ConsumerSet.receive (dataflow_worker/executor.c:3487)
cython.cast(Operation, consumer).process(windowed_value)
File "dataflow_worker/executor.py", line 500, in dataflow_worker.executor.DoOperation.process (dataflow_worker/executor.c:14239)
self.dofn_receiver.receive(o)
File "apache_beam/runners/common.py", line 134, in apache_beam.runners.common.DoFnRunner.receive (apache_beam/runners/common.c:4172)
self.process(windowed_value)
File "apache_beam/runners/common.py", line 168, in apache_beam.runners.common.DoFnRunner.process (apache_beam/runners/common.c:5282)
self.reraise_augmented(exn)
File "apache_beam/runners/common.py", line 181, in apache_beam.runners.common.DoFnRunner.reraise_augmented (apache_beam/runners/common.c:5665)
raise
File "apache_beam/runners/common.py", line 166, in apache_beam.runners.common.DoFnRunner.process (apache_beam/runners/common.c:5218)
self._process_outputs(element, self.dofn_process(self.context))
File "apache_beam/runners/common.py", line 222, in apache_beam.runners.common.DoFnRunner._process_outputs (apache_beam/runners/common.c:6400)
self.main_receivers.receive(windowed_value)
File "dataflow_worker/executor.py", line 89, in dataflow_worker.executor.ConsumerSet.receive (dataflow_worker/executor.c:3487)
cython.cast(Operation, consumer).process(windowed_value)
File "dataflow_worker/executor.py", line 500, in dataflow_worker.executor.DoOperation.process (dataflow_worker/executor.c:14239)
self.dofn_receiver.receive(o)
File "apache_beam/runners/common.py", line 134, in apache_beam.runners.common.DoFnRunner.receive (apache_beam/runners/common.c:4172)
self.process(windowed_value)
File "apache_beam/runners/common.py", line 168, in apache_beam.runners.common.DoFnRunner.process (apache_beam/runners/common.c:5282)
self.reraise_augmented(exn)
File "apache_beam/runners/common.py", line 179, in apache_beam.runners.common.DoFnRunner.reraise_augmented (apache_beam/runners/common.c:5646)
raise type(exn), args, sys.exc_info()[2]
File "apache_beam/runners/common.py", line 166, in apache_beam.runners.common.DoFnRunner.process (apache_beam/runners/common.c:5218)
self._process_outputs(element, self.dofn_process(self.context))
File "apache_beam/runners/common.py", line 191, in apache_beam.runners.common.DoFnRunner._process_outputs (apache_beam/runners/common.c:5838)
for result in results:
File "trainer/preprocess.py", line 130, in process
KeyError: u"FALSE [while running 'Extract label ids']"
Job Logs - Workflow failed
(f3c7c09c0b6a453c): Workflow failed. Causes: (688819c5d32d79c8): S06:Read input+Parse input+Extract label ids+Read and convert to JPEG+Embed and make TFExample+Save to disk/Write to gs:__yelp_restaurant_photo_classification_yelp_restaurant_photo_classification_preproc_eval/Write/WriteImpl/write_bundles+Save to disk/Write to gs:__yelp_restaurant_photo_classification_yelp_restaurant_photo_classification_preproc_eval/Write/WriteImpl/pair+Save to disk/Write to gs:__yelp_restaurant_photo_classification_yelp_restaurant_photo_classification_preproc_eval/Write/WriteImpl/WindowInto+Save to disk/Write to gs:__yelp_restaurant_photo_classification_yelp_restaurant_photo_classification_preproc_eval/Write/WriteImpl/GroupByKey/Reify+Save to disk/Write to gs:__yelp_restaurant_photo_classification_yelp_restaurant_photo_classification_preproc_eval/Write/WriteImpl/GroupByKey/Write failed.
You probably have a row in your input CSV file where the label is 'FALSE', but 'FALSE' is not in 'dict.txt'.