I have an RDD object (bid) that I get from a map, and then I just do a simple bid.count(), which fails with the error below. Can someone tell me what this error means? I tried to Google it but found no answer.
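Roughly, the job looks like this (a simplified stand-in with placeholder data; the real input and parsing are different):
# simplified stand-in for the real pipeline, placeholder data only
raw = sc.parallelize(["1,foo", "2,bar"])        # placeholder for the real input RDD
bid = raw.map(lambda line: line.split(","))     # the map step mentioned above
print(bid.count())                              # the count() action that fails
The full traceback: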
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-10-0283df3b1938> in <module>()
----> 1 bid.count()
/home/username/spark-1.0.0-bin-hadoop2/python/pyspark/rdd.pyc in count(self)
706 3
707 """
--> 708 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
709
710 def stats(self):
/home/username/spark-1.0.0-bin-hadoop2/python/pyspark/rdd.pyc in sum(self)
697 6.0
698 """
--> 699 return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
700
701 def count(self):
/home/username/spark-1.0.0-bin-hadoop2/python/pyspark/rdd.pyc in reduce(self, f)
617 if acc is not None:
618 yield acc
--> 619 vals = self.mapPartitions(func).collect()
620 return reduce(f, vals)
621
/home/username/spark-1.0.0-bin-hadoop2/python/pyspark/rdd.pyc in collect(self)
581 """
582 with _JavaStackTrace(self.context) as st:
--> 583 bytesInJava = self._jrdd.collect().iterator()
584 return list(self._collect_iterator_through_file(bytesInJava))
585
/home/username/spark-1.0.0-bin-hadoop2/python/lib/py4j-0.8.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
535 answer = self.gateway_client.send_command(command)
536 return_value = get_return_value(answer, self.gateway_client,
--> 537 self.target_id, self.name)
538
539 for temp_arg in temp_args:
/home/username/spark-1.0.0-bin-hadoop2/python/lib/py4j-0.8.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling o148.collect.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1.0:9 failed 4 times, most recent failure: TID 7 on host 08.bm-hadoope-datanode.dev.lax1 failed for unknown reason
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1033)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1017)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1015)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1015)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:633)
at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1207)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
I then join the bid table with another table and get a new RDD. That seems to go smoothly, but when I call first() on this new RDD, I receive the following error:
Py4JJavaError: An error occurred while calling o274.collectPartitions.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 9.0:6 failed 4 times, most recent failure: Exception failure in TID 32 on host 05.bm-hadoope-datanode.dev.lax1: java.io.IOException: Filesystem closed
org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:629)
org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:735)
org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:793)
java.io.DataInputStream.readFully(DataInputStream.java:195)
java.io.DataInputStream.readFully(DataInputStream.java:169)
parquet.hadoop.ParquetFileReader$ConsecutiveChunkList.readAll(ParquetFileReader.java:599)
parquet.hadoop.ParquetFileReader.readNextRowGroup(ParquetFileReader.java:360)
parquet.hadoop.InternalParquetRecordReader.checkRead(InternalParquetRecordReader.java:100)
parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:172)
parquet.hadoop.ParquetRecordReader.nextKeyValue(ParquetRecordReader.java:130)
org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:122)
org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:293)
org.apache.spark.api.python.PythonRDD$WriterThread$$anonfun$run$1.apply$mcV$sp(PythonRDD.scala:200)
org.apache.spark.api.python.PythonRDD$WriterThread$$anonfun$run$1.apply(PythonRDD.scala:175)
org.apache.spark.api.python.PythonRDD$WriterThread$$anonfun$run$1.apply(PythonRDD.scala:175)
org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1160)
org.apache.spark.api.python.PythonRDD$WriterThread.run(PythonRDD.scala:174)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1033)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1017)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1015)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1015)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:633)
at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1207)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
at akka.actor.ActorCell.invoke(ActorCell.scala:456)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
at akka.dispatch.Mailbox.run(Mailbox.scala:219)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
I am following this GitHub repo, adapting it to a text classification problem built on DistilBERT. So given a string of text, the model should return a label and a (probability) score.
Output from the model:
sentiment_input = {"inputs": "I love using the new Inference DLC."}
# sentiment_input= "I love using the new Inference DLC."
response = predictor.predict(data=sentiment_input)
print(response)
Output:
[{'label': 'LABEL_80', 'score': 0.008507220074534416}]
When I run the following:
# Create an EndpointInput
endpointInput = EndpointInput(
endpoint_name=predictor.endpoint_name,
probability_attribute="score",
inference_attribute="label",
# probability_threshold_attribute=0.5,
destination="/opt/ml/processing/input_data",
)
# Create the monitoring schedule to execute every hour.
from sagemaker.model_monitor import CronExpressionGenerator
response = clinc_intent0911.create_monitoring_schedule(
monitor_schedule_name=clincintent_monitor_schedule_name,
endpoint_input=endpointInput,
output_s3_uri=baseline_results_uri,
problem_type="MulticlassClassification",
ground_truth_input=ground_truth_upload_path,
constraints=baseline_job.suggested_constraints(),
schedule_cron_expression=CronExpressionGenerator.hourly(),
enable_cloudwatch_metrics=True,
)
I get the following error:
---------------------------------------------------------------------------
ClientError Traceback (most recent call last)
<ipython-input-269-72e7049246fb> in <module>
10 constraints=baseline_job.suggested_constraints(),
11 schedule_cron_expression=CronExpressionGenerator.hourly(),
---> 12 enable_cloudwatch_metrics=True,
13 )
/opt/conda/lib/python3.6/site-packages/sagemaker/model_monitor/model_monitoring.py in create_monitoring_schedule(self, endpoint_input, ground_truth_input, problem_type, record_preprocessor_script, post_analytics_processor_script, output_s3_uri, constraints, monitor_schedule_name, schedule_cron_expression, enable_cloudwatch_metrics)
2615 network_config=self.network_config,
2616 )
-> 2617 self.sagemaker_session.sagemaker_client.create_model_quality_job_definition(**request_dict)
2618
2619 # create schedule
/opt/conda/lib/python3.6/site-packages/botocore/client.py in _api_call(self, *args, **kwargs)
355 "%s() only accepts keyword arguments." % py_operation_name)
356 # The "self" in this scope is referring to the BaseClient.
--> 357 return self._make_api_call(operation_name, kwargs)
358
359 _api_call.__name__ = str(py_operation_name)
/opt/conda/lib/python3.6/site-packages/botocore/client.py in _make_api_call(self, operation_name, api_params)
674 error_code = parsed_response.get("Error", {}).get("Code")
675 error_class = self.exceptions.from_code(error_code)
--> 676 raise error_class(parsed_response, operation_name)
677 else:
678 return parsed_response
ClientError: An error occurred (ValidationException) when calling the CreateModelQualityJobDefinition operation: Endpoint 'clinc-intent-analysis-0911' does not exist or is not valid
At this point my SageMaker endpoint is live, and I am unable to debug why it is "not valid".
SageMaker ModelMonitor currently only works out of the box for tabular datasets (see the documentation), hence the "not valid" error message. To use it on NLP problems, you'd have to bring your own model monitor container (BYOC). Here is an example to get started: https://aws.amazon.com/blogs/machine-learning/detect-nlp-data-drift-using-custom-amazon-sagemaker-model-monitor/
and the associated GitHub repo is here: https://github.com/aws-samples/detecting-data-drift-in-nlp-using-amazon-sagemaker-custom-model-monitor
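Separately, to rule out a genuinely missing or misnamed endpoint, you could describe it first. A quick boto3 check (the endpoint name here is taken from the error message above):
import boto3

sm = boto3.client("sagemaker")
# endpoint name copied from the ValidationException above
desc = sm.describe_endpoint(EndpointName="clinc-intent-analysis-0911")
print(desc["EndpointStatus"])   # expect "InService" if the endpoint itself is fine
If that prints InService, the ValidationException is coming from the tabular-only restriction described above rather than from a missing endpoint.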
I am using GCP's paid Geocoding API to get the latitude and longitude of around 1 million addresses. It is very slow, but I can wait. The problem is that I get an SSLError, sometimes after 15,000 requests and sometimes after 6,000 or so. I am using Python 2.7, and I have tried to catch the error, but it doesn't get resolved.
Error:
SSLErrorTraceback (most recent call last)
<ipython-input-2-7dd6c7aa3195> in <module>()
14 count=0
15 try:
---> 16 location2 = geocoder2.geocode(row,timeout=1,components={"country": "PK","locality":"Sindh"})
17 if location2:
18 print(count2,location2.latitude,location2.longitude)
/home/prassani/prassani/local/lib/python2.7/site-packages/geopy/geocoders/googlev3.pyc in geocode(self, query, exactly_one, timeout, bounds, region, components, place_id, language, sensor)
271 logger.debug("%s.geocode: %s", self.__class__.__name__, url)
272 return self._parse_json(
--> 273 self._call_geocoder(url, timeout=timeout), exactly_one
274 )
275
/home/prassani/prassani/local/lib/python2.7/site-packages/geopy/geocoders/base.pyc in _call_geocoder(self, url, timeout, raw, requester, deserializer, **kwargs)
398 return page
399
--> 400 page = decode_page(page)
401
402 if deserializer is not None:
/home/prassani/prassani/local/lib/python2.7/site-packages/geopy/util.pyc in decode_page(page)
50 else:
51 encoding = page.headers.getparam("charset") or "utf-8"
---> 52 return text_type(page.read(), encoding=encoding)
53 else: # requests?
54 encoding = page.headers.get("charset") or "utf-8"
/usr/lib/python2.7/socket.pyc in read(self, size)
353 while True:
354 try:
--> 355 data = self._sock.recv(rbufsize)
356 except error, e:
357 if e.args[0] == EINTR:
/usr/lib/python2.7/httplib.pyc in read(self, amt)
605 # connection, and the user is reading more bytes than will be provided
606 # (for example, reading in 1k chunks)
--> 607 s = self.fp.read(amt)
608 if not s and amt:
609 # Ideally, we would raise IncompleteRead if the content-length
/usr/lib/python2.7/socket.pyc in read(self, size)
382 # fragmentation issues on many platforms.
383 try:
--> 384 data = self._sock.recv(left)
385 except error, e:
386 if e.args[0] == EINTR:
/usr/lib/python2.7/ssl.pyc in recv(self, buflen, flags)
770 "non-zero flags not allowed in calls to recv() on %s" %
771 self.__class__)
--> 772 return self.read(buflen)
773 else:
774 return self._sock.recv(buflen, flags)
/usr/lib/python2.7/ssl.pyc in read(self, len, buffer)
657 v = self._sslobj.read(len, buffer)
658 else:
--> 659 v = self._sslobj.read(len)
660 return v
661 except SSLError as x:
SSLError: ('The read operation timed out',)
Code I am using:
from geopy.geocoders import GoogleV3
from geopy.exc import GeocoderServiceError, GeocoderTimedOut
from requests.exceptions import SSLError
import time
import pandas as pd

df = pd.read_csv("./sat-data.csv")
df['Address'] = ''
df['Location'] = ''
geocoder2 = GoogleV3(api_key="somekey")
count = 0
count2 = 0
for index, row in df.School_Address.iteritems():
    count += 1
    count2 += 1
    if count == 4900:
        time.sleep(100)
        count = 0
    try:
        location2 = geocoder2.geocode(row, timeout=1, components={"country": "PK", "locality": "Sindh"})
        if location2:
            print(count2, location2.latitude, location2.longitude)
            df.Address.loc[index] = location2.address
            df.Location.loc[index] = (location2.latitude, location2.longitude)
        else:
            df.Address.loc[index] = "None"
            df.Location.loc[index] = "None"
    except (GeocoderServiceError, GeocoderTimedOut, SSLError), e:
        print("Error: geocode failed on input")

df.to_csv('./sat-data-new.csv', index=False)
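One detail worth noting from the traceback (an observation, not a verified fix): the exception raised is Python's ssl.SSLError, while the except clause only catches requests.exceptions.SSLError, which is an unrelated class, so the handler never matches. A minimal retry sketch that catches the exception actually shown above (the larger timeout and the back-off values are guesses):
import ssl
import time
from geopy.exc import GeocoderServiceError, GeocoderTimedOut

def geocode_with_retry(geocoder, address, retries=3):
    # retry a geocode call, catching the ssl.SSLError seen in the traceback
    for attempt in range(retries):
        try:
            return geocoder.geocode(address, timeout=10,
                                    components={"country": "PK", "locality": "Sindh"})
        except (GeocoderServiceError, GeocoderTimedOut, ssl.SSLError):
            time.sleep(2 * (attempt + 1))   # simple back-off before retrying
    return None
Inside the loop, the call would then become location2 = geocode_with_retry(geocoder2, row).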
In [16]: r
Out[16]: Redis<ConnectionPool<Connection<host=****,port=*****,db=3>>>
In [17]: r.set('zaza', 'king')
---------------------------------------------------------------------------
ConnectionError Traceback (most recent call last)
<ipython-input-17-8126d1846970> in <module>
----> 1 r.set('zaza', 'king')
/usr/local/lib/python3.7/site-packages/redis/client.py in set(self, name, value, ex, px, nx, xx)
1517 if xx:
1518 pieces.append('XX')
-> 1519 return self.execute_command('SET', *pieces)
1520
1521 def __setitem__(self, name, value):
/usr/local/lib/python3.7/site-packages/redis/client.py in execute_command(self, *args, **options)
834 pool = self.connection_pool
835 command_name = args[0]
--> 836 conn = self.connection or pool.get_connection(command_name, **options)
837 try:
838 conn.send_command(*args)
/usr/local/lib/python3.7/site-packages/redis/connection.py in get_connection(self, command_name, *keys, **options)
1069 try:
1070 # ensure this connection is connected to Redis
-> 1071 connection.connect()
1072 # connections that the pool provides should be ready to send
1073 # a command. if not, the connection was either returned to the
/usr/local/lib/python3.7/site-packages/redis/connection.py in connect(self)
545 self._sock = sock
546 try:
--> 547 self.on_connect()
548 except RedisError:
549 # clean up after any error in on_connect
/usr/local/lib/python3.7/site-packages/redis/connection.py in on_connect(self)
615 # to check the health prior to the AUTH
616 self.send_command('AUTH', self.password, check_health=False)
--> 617 if nativestr(self.read_response()) != 'OK':
618 raise AuthenticationError('Invalid Password')
619
/usr/local/lib/python3.7/site-packages/redis/connection.py in read_response(self)
697 "Read the response from a previously sent command"
698 try:
--> 699 response = self._parser.read_response()
700 except socket.timeout:
701 self.disconnect()
/usr/local/lib/python3.7/site-packages/redis/connection.py in read_response(self)
307
308 def read_response(self):
--> 309 response = self._buffer.readline()
310 if not response:
311 raise ConnectionError(SERVER_CLOSED_CONNECTION_ERROR)
/usr/local/lib/python3.7/site-packages/redis/connection.py in readline(self)
239 while not data.endswith(SYM_CRLF):
240 # there's more data in the socket that we need
--> 241 self._read_from_socket()
242 buf.seek(self.bytes_read)
243 data = buf.readline()
/usr/local/lib/python3.7/site-packages/redis/connection.py in _read_from_socket(self, length, timeout, raise_on_timeout)
184 # an empty string indicates the server shutdown the socket
185 if isinstance(data, bytes) and len(data) == 0:
--> 186 raise ConnectionError(SERVER_CLOSED_CONNECTION_ERROR)
187 buf.write(data)
188 data_length = len(data)
This started happening after a move to hosted Redis from a local instance.
So, the problem is that the Redis connection string on these hosted solutions has to start with rediss:// (as in Redis + SSL), per the official documentation:
https://redislabs.com/lp/python-redis/
If you use hosted Redis from AWS or DigitalOcean, this may well happen to you.
If you are using Celery, you would also need to modify your app config in app.py, as per
https://github.com/celery/celery/issues/5371
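To illustrate the connection-string change above, a minimal sketch (host, password, port and db number are placeholders):
import redis

# note the rediss:// scheme (TLS); a plain redis:// URL is what fails against hosted instances
r = redis.from_url("rediss://:mypassword@my-hosted-redis.example.com:6380/3")
r.set('zaza', 'king')
print(r.get('zaza'))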
I tried to install Spark on my Windows 10 machine. I have Anaconda2 with Python 2.7. I managed to open an IPython notebook instance, and I am able to run the following lines:
airlines = sc.textFile("airlines.csv")
print(airlines)
But I get an error when I run: airlines.first()
Here's the error I get:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-6-85a5d6f5110f> in <module>()
----> 1 airlines.first()
C:\spark\python\pyspark\rdd.py in first(self)
1326 ValueError: RDD is empty
1327 """
-> 1328 rs = self.take(1)
1329 if rs:
1330 return rs[0]
C:\spark\python\pyspark\rdd.py in take(self, num)
1308
1309 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
-> 1310 res = self.context.runJob(self, takeUpToNumLeft, p)
1311
1312 items += res
C:\spark\python\pyspark\context.py in runJob(self, rdd, partitionFunc, partitions, allowLocal)
932 mappedRDD = rdd.mapPartitions(partitionFunc)
933 port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
--> 934 return list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
935
936 def show_profiles(self):
C:\spark\python\pyspark\rdd.py in _load_from_socket(port, serializer)
137 break
138 if not sock:
--> 139 raise Exception("could not open socket")
140 try:
141 rf = sock.makefile("rb", 65536)
Exception: could not open socket
I get a different error when I execute: airlines.collect()
Here's the error:
---------------------------------------------------------------------------
error Traceback (most recent call last)
<ipython-input-5-3745b2fa985a> in <module>()
1 # Using the collect operation, you can view the full dataset
----> 2 airlines.collect()
C:\spark\python\pyspark\rdd.py in collect(self)
775 with SCCallSiteSync(self.context) as css:
776 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
--> 777 return list(_load_from_socket(port, self._jrdd_deserializer))
778
779 def reduce(self, f):
C:\spark\python\pyspark\rdd.py in _load_from_socket(port, serializer)
140 try:
141 rf = sock.makefile("rb", 65536)
--> 142 for item in serializer.load_stream(rf):
143 yield item
144 finally:
C:\spark\python\pyspark\serializers.py in load_stream(self, stream)
515 try:
516 while True:
--> 517 yield self.loads(stream)
518 except struct.error:
519 return
C:\spark\python\pyspark\serializers.py in loads(self, stream)
504
505 def loads(self, stream):
--> 506 length = read_int(stream)
507 if length == SpecialLengths.END_OF_DATA_SECTION:
508 raise EOFError
C:\spark\python\pyspark\serializers.py in read_int(stream)
541
542 def read_int(stream):
--> 543 length = stream.read(4)
544 if not length:
545 raise EOFError
C:\Users\AS\Anaconda2\lib\socket.pyc in read(self, size)
382 # fragmentation issues on many platforms.
383 try:
--> 384 data = self._sock.recv(left)
385 except error, e:
386 if e.args[0] == EINTR:
error: [Errno 10054] An existing connection was forcibly closed by the remote host
Please help.
INSTALL PYSPARK on Windows 10
JUPYTER-NOTEBOOK With ANACONDA NAVIGATOR
STEP 1
Download Packages
1) spark-2.2.0-bin-hadoop2.7.tgz Download
2) java jdk 8 version Download
3) Anaconda v 5.2 Download
4) scala-2.12.6.msi Download
5) hadoop v2.7.1 Download
STEP 2
MAKE A SPARK FOLDER IN THE C:\ DRIVE AND PUT EVERYTHING INSIDE IT
The folder will end up containing the hadoop, scala and spark subfolders referenced by the environment variables below.
NOTE: DURING INSTALLATION OF SCALA, GIVE THE PATH OF THE SCALA FOLDER INSIDE THE SPARK FOLDER (C:\spark\scala)
STEP 3
NOW SET NEW WINDOWS ENVIRONMENT VARIABLES
HADOOP_HOME=C:\spark\hadoop
JAVA_HOME=C:\Program Files\Java\jdk1.8.0_151
SCALA_HOME=C:\spark\scala\bin
SPARK_HOME=C:\spark\spark\bin
PYSPARK_PYTHON=C:\Users\user\Anaconda3\python.exe
PYSPARK_DRIVER_PYTHON=C:\Users\user\Anaconda3\Scripts\jupyter.exe
PYSPARK_DRIVER_PYTHON_OPTS=notebook
NOW SET THE PATH OF SPARK: EDIT THE Path VARIABLE AND ADD A NEW ENTRY
Add "C:\spark\spark\bin" to the Windows "Path" variable
STEP 4
Make a folder where you want to store Jupyter Notebook outputs and files.
After that, open the Anaconda command prompt and cd into that folder,
then enter pyspark.
That's it: your browser will pop up with Jupyter on localhost.
STEP 5
Check whether PySpark is working.
Type some simple code and run it:
from pyspark.sql import Row
a = Row(name='Vinay', age=22, height=165)
print("a: ", a)
I have been searching for a solution to this but haven't found anything even remotely connected to it.
I am trying to read in a CSV file using sc.textFile() in Spark. I am using version 1.5.1. When I read it in, I don't get an error, but when I try to do any operation other than .count(), for example .take(2), I get an error. I can read the file with pandas and then convert it into a Spark DataFrame, and everything works correctly that way. Here is the code, and the error when I try to do rdd.take(2):
import pandas as pd
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
data_pd = pd.read_csv('Data_Cortex_Nuclear.csv')
df = pd.DataFrame(data_pd)
df = sqlContext.createDataFrame(df)

rdd = sc.textFile('Data_Cortex_Nuclear.csv')
rdd.take(2)  # raises the error below
df.take(2)   # works correctly
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-168-8a14f6c71652> in <module>()
8
9 rdd = sc.textFile('Data_Cortex_Nuclear.csv')
---> 10 rdd.take(2)
11 df.take(2)
C:\Users\Anna\Downloads\spark-1.5.1-bin-hadoop2.6\spark-1.5.1-bin-hadoop2.6\python\pyspark\rdd.pyc in take(self, num)
1297
1298 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
-> 1299 res = self.context.runJob(self, takeUpToNumLeft, p)
1300
1301 items += res
C:\Users\Anna\Downloads\spark-1.5.1-bin-hadoop2.6\spark-1.5.1-bin-hadoop2.6\python\pyspark\context.pyc in runJob(self, rdd, partitionFunc, partitions, allowLocal)
914 # SparkContext#runJob.
915 mappedRDD = rdd.mapPartitions(partitionFunc)
--> 916 port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
917 return list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
918
C:\Users\Anna\Downloads\spark-1.5.1-bin-hadoop2.6\spark-1.5.1-bin-hadoop2.6\python\lib\py4j-0.8.2.1-src.zip\py4j\java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
C:\Users\Anna\Downloads\spark-1.5.1-bin-hadoop2.6\spark-1.5.1-bin-hadoop2.6\python\pyspark\sql\utils.pyc in deco(*a, **kw)
34 def deco(*a, **kw):
35 try:
---> 36 return f(*a, **kw)
37 except py4j.protocol.Py4JJavaError as e:
38 s = e.java_exception.toString()
C:\Users\Anna\Downloads\spark-1.5.1-bin-hadoop2.6\spark-1.5.1-bin-hadoop2.6\python\lib\py4j-0.8.2.1-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 78.0 failed 1 times, most recent failure: Lost task 0.0 in stage 78.0 (TID 78, localhost): java.net.SocketException: Connection reset by peer: socket write error
at java.net.SocketOutputStream.socketWrite0(Native Method)
at java.net.SocketOutputStream.socketWrite(Unknown Source)
at java.net.SocketOutputStream.write(Unknown Source)
at java.io.BufferedOutputStream.flushBuffer(Unknown Source)
at java.io.BufferedOutputStream.write(Unknown Source)
at java.io.DataOutputStream.write(Unknown Source)
at java.io.FilterOutputStream.write(Unknown Source)
at org.apache.spark.api.python.PythonRDD$.writeUTF(PythonRDD.scala:622)
at org.apache.spark.api.python.PythonRDD$.org$apache$spark$api$python$PythonRDD$$write$1(PythonRDD.scala:442)
at org.apache.spark.api.python.PythonRDD$$anonfun$writeIteratorToStream$1.apply(PythonRDD.scala:452)
at org.apache.spark.api.python.PythonRDD$$anonfun$writeIteratorToStream$1.apply(PythonRDD.scala:452)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:452)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:280)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1699)
at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:239)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1822)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1835)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1848)
at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:393)
at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
at sun.reflect.GeneratedMethodAccessor74.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Unknown Source)
Caused by: java.net.SocketException: Connection reset by peer: socket write error
at java.net.SocketOutputStream.socketWrite0(Native Method)
at java.net.SocketOutputStream.socketWrite(Unknown Source)
at java.net.SocketOutputStream.write(Unknown Source)
at java.io.BufferedOutputStream.flushBuffer(Unknown Source)
at java.io.BufferedOutputStream.write(Unknown Source)
at java.io.DataOutputStream.write(Unknown Source)
at java.io.FilterOutputStream.write(Unknown Source)
at org.apache.spark.api.python.PythonRDD$.writeUTF(PythonRDD.scala:622)
at org.apache.spark.api.python.PythonRDD$.org$apache$spark$api$python$PythonRDD$$write$1(PythonRDD.scala:442)
at org.apache.spark.api.python.PythonRDD$$anonfun$writeIteratorToStream$1.apply(PythonRDD.scala:452)
at org.apache.spark.api.python.PythonRDD$$anonfun$writeIteratorToStream$1.apply(PythonRDD.scala:452)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:452)
at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:280)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1699)
at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:239)