Setting up pyspark on windows 10 - python-2.7

I tried to install Spark on my Windows 10 machine. I have Anaconda2 with Python 2.7. I managed to open an IPython notebook instance, and I am able to run the following lines:
airlines=sc.textFile("airlines.csv")
print (airlines)
But I get an error when I run: airlines.first()
Here's the error I get:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-6-85a5d6f5110f> in <module>()
----> 1 airlines.first()
C:\spark\python\pyspark\rdd.py in first(self)
1326 ValueError: RDD is empty
1327 """
-> 1328 rs = self.take(1)
1329 if rs:
1330 return rs[0]
C:\spark\python\pyspark\rdd.py in take(self, num)
1308
1309 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
-> 1310 res = self.context.runJob(self, takeUpToNumLeft, p)
1311
1312 items += res
C:\spark\python\pyspark\context.py in runJob(self, rdd, partitionFunc, partitions, allowLocal)
932 mappedRDD = rdd.mapPartitions(partitionFunc)
933 port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
--> 934 return list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
935
936 def show_profiles(self):
C:\spark\python\pyspark\rdd.py in _load_from_socket(port, serializer)
137 break
138 if not sock:
--> 139 raise Exception("could not open socket")
140 try:
141 rf = sock.makefile("rb", 65536)
Exception: could not open socket
I get a different error when I execute: airlines.collect()
Here's the error:
---------------------------------------------------------------------------
error Traceback (most recent call last)
<ipython-input-5-3745b2fa985a> in <module>()
1 # Using the collect operation, you can view the full dataset
----> 2 airlines.collect()
C:\spark\python\pyspark\rdd.py in collect(self)
775 with SCCallSiteSync(self.context) as css:
776 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
--> 777 return list(_load_from_socket(port, self._jrdd_deserializer))
778
779 def reduce(self, f):
C:\spark\python\pyspark\rdd.py in _load_from_socket(port, serializer)
140 try:
141 rf = sock.makefile("rb", 65536)
--> 142 for item in serializer.load_stream(rf):
143 yield item
144 finally:
C:\spark\python\pyspark\serializers.py in load_stream(self, stream)
515 try:
516 while True:
--> 517 yield self.loads(stream)
518 except struct.error:
519 return
C:\spark\python\pyspark\serializers.py in loads(self, stream)
504
505 def loads(self, stream):
--> 506 length = read_int(stream)
507 if length == SpecialLengths.END_OF_DATA_SECTION:
508 raise EOFError
C:\spark\python\pyspark\serializers.py in read_int(stream)
541
542 def read_int(stream):
--> 543 length = stream.read(4)
544 if not length:
545 raise EOFError
C:\Users\AS\Anaconda2\lib\socket.pyc in read(self, size)
382 # fragmentation issues on many platforms.
383 try:
--> 384 data = self._sock.recv(left)
385 except error, e:
386 if e.args[0] == EINTR:
error: [Errno 10054] An existing connection was forcibly closed by the remote host
Please help.

INSTALL PYSPARK on Windows 10
JUPYTER-NOTEBOOK With ANACONDA NAVIGATOR
STEP 1
Download Packages
1) spark-2.2.0-bin-hadoop2.7.tgz Download
2) java jdk 8 version Download
3) Anaconda v 5.2 Download
4) scala-2.12.6.msi Download
5) hadoop v2.7.1 Download
STEP 2
MAKE SPARK FOLDER IN C:/ DRIVE AND PUT EVERYTHING INSIDE IT
It will look like this
NOTE : DURING INSTALLATION OF SCALA GIVE PATH OF SCALA INSIDE SPARK FOLDER
STEP 3
NOW SET NEW WINDOWS ENVIRONMENT VARIABLES
HADOOP_HOME=C:\spark\hadoop
JAVA_HOME=C:\Program Files\Java\jdk1.8.0_151
SCALA_HOME=C:\spark\scala\bin
SPARK_HOME=C:\spark\spark\bin
PYSPARK_PYTHON=C:\Users\user\Anaconda3\python.exe
PYSPARK_DRIVER_PYTHON=C:\Users\user\Anaconda3\Scripts\jupyter.exe
PYSPARK_DRIVER_PYTHON_OPTS=notebook
NOW SELECT PATH OF SPARK : EDIT AND ADD NEW
Add "C:\spark\spark\bin" to the Windows "Path" variable
STEP 4
Make folder where you want to store Jupyter-Notebook outputs and files
After that, open the Anaconda command prompt and cd into that folder,
then enter pyspark.
That's it: your browser will open with Jupyter running on localhost.
STEP 5
Check whether pyspark is working!
Type simple code and run it
from pyspark.sql import Row
a = Row(name = 'Vinay' , age=22 , height=165)
print("a: ",a)

Related

How to handle SSLError in Geocoding API?

I am using GCP's paid Geocoding API to get the latitude and longitude of around 1 million addresses. It is very slow, but I can wait. The problem is that I am getting an SSLError, sometimes after 15000 requests and sometimes after 6000 requests or so. I am using Python 2.7 and I have tried to catch the error, but that does not resolve it.
Error:
SSLErrorTraceback (most recent call last)
<ipython-input-2-7dd6c7aa3195> in <module>()
14 count=0
15 try:
---> 16 location2 = geocoder2.geocode(row,timeout=1,components={"country": "PK","locality":"Sindh"})
17 if location2:
18 print(count2,location2.latitude,location2.longitude)
/home/prassani/prassani/local/lib/python2.7/site-packages/geopy/geocoders/googlev3.pyc in geocode(self, query, exactly_one, timeout, bounds, region, components, place_id, language, sensor)
271 logger.debug("%s.geocode: %s", self.__class__.__name__, url)
272 return self._parse_json(
--> 273 self._call_geocoder(url, timeout=timeout), exactly_one
274 )
275
/home/prassani/prassani/local/lib/python2.7/site-packages/geopy/geocoders/base.pyc in _call_geocoder(self, url, timeout, raw, requester, deserializer, **kwargs)
398 return page
399
--> 400 page = decode_page(page)
401
402 if deserializer is not None:
/home/prassani/prassani/local/lib/python2.7/site-packages/geopy/util.pyc in decode_page(page)
50 else:
51 encoding = page.headers.getparam("charset") or "utf-8"
---> 52 return text_type(page.read(), encoding=encoding)
53 else: # requests?
54 encoding = page.headers.get("charset") or "utf-8"
/usr/lib/python2.7/socket.pyc in read(self, size)
353 while True:
354 try:
--> 355 data = self._sock.recv(rbufsize)
356 except error, e:
357 if e.args[0] == EINTR:
/usr/lib/python2.7/httplib.pyc in read(self, amt)
605 # connection, and the user is reading more bytes than will be provided
606 # (for example, reading in 1k chunks)
--> 607 s = self.fp.read(amt)
608 if not s and amt:
609 # Ideally, we would raise IncompleteRead if the content-length
/usr/lib/python2.7/socket.pyc in read(self, size)
382 # fragmentation issues on many platforms.
383 try:
--> 384 data = self._sock.recv(left)
385 except error, e:
386 if e.args[0] == EINTR:
/usr/lib/python2.7/ssl.pyc in recv(self, buflen, flags)
770 "non-zero flags not allowed in calls to recv() on %s" %
771 self.__class__)
--> 772 return self.read(buflen)
773 else:
774 return self._sock.recv(buflen, flags)
/usr/lib/python2.7/ssl.pyc in read(self, len, buffer)
657 v = self._sslobj.read(len, buffer)
658 else:
--> 659 v = self._sslobj.read(len)
660 return v
661 except SSLError as x:
SSLError: ('The read operation timed out',)
Code I am using:
from geopy.geocoders import GoogleV3
from geopy.exc import GeocoderServiceError, GeocoderTimedOut
from requests.exceptions import SSLError
import time
import pandas as pd
df=pd.read_csv("./sat-data.csv")
df['Address']=''
df['Location']=''
geocoder2 = GoogleV3(api_key="somekey")
count=0
count2=0
for index, row in df.School_Address.iteritems():
count+=1
count2+=1
if count ==4900:
time.sleep(100)
count=0
try:
location2 = geocoder2.geocode(row,timeout=1,components={"country": "PK","locality":"Sindh"})
if location2:
print(count2,location2.latitude,location2.longitude)
df.Address.loc[index]=location2.address
df.Location.loc[index]=(location2.latitude, location2.longitude)
else:
df.Address.loc[index]="None"
df.Location.loc[index]="None"
except (GeocoderServiceError, GeocoderTimedOut, SSLError), e:
print("Error: geocode failed on input")
df.to_csv('./sat-data-new.csv', index=False)

Pyredis cannot connect to Digital Ocean hosted Redis (connection lost ConnectionError exception)

In [16]: r
Out[16]: Redis<ConnectionPool<Connection<host=****,port=*****,db=3>>>
In [17]: r.set('zaza', 'king')
---------------------------------------------------------------------------
ConnectionError Traceback (most recent call last)
<ipython-input-17-8126d1846970> in <module>
----> 1 r.set('zaza', 'king')
/usr/local/lib/python3.7/site-packages/redis/client.py in set(self, name, value, ex, px, nx, xx)
1517 if xx:
1518 pieces.append('XX')
-> 1519 return self.execute_command('SET', *pieces)
1520
1521 def __setitem__(self, name, value):
/usr/local/lib/python3.7/site-packages/redis/client.py in execute_command(self, *args, **options)
834 pool = self.connection_pool
835 command_name = args[0]
--> 836 conn = self.connection or pool.get_connection(command_name, **options)
837 try:
838 conn.send_command(*args)
/usr/local/lib/python3.7/site-packages/redis/connection.py in get_connection(self, command_name, *keys, **options)
1069 try:
1070 # ensure this connection is connected to Redis
-> 1071 connection.connect()
1072 # connections that the pool provides should be ready to send
1073 # a command. if not, the connection was either returned to the
/usr/local/lib/python3.7/site-packages/redis/connection.py in connect(self)
545 self._sock = sock
546 try:
--> 547 self.on_connect()
548 except RedisError:
549 # clean up after any error in on_connect
/usr/local/lib/python3.7/site-packages/redis/connection.py in on_connect(self)
615 # to check the health prior to the AUTH
616 self.send_command('AUTH', self.password, check_health=False)
--> 617 if nativestr(self.read_response()) != 'OK':
618 raise AuthenticationError('Invalid Password')
619
/usr/local/lib/python3.7/site-packages/redis/connection.py in read_response(self)
697 "Read the response from a previously sent command"
698 try:
--> 699 response = self._parser.read_response()
700 except socket.timeout:
701 self.disconnect()
/usr/local/lib/python3.7/site-packages/redis/connection.py in read_response(self)
307
308 def read_response(self):
--> 309 response = self._buffer.readline()
310 if not response:
311 raise ConnectionError(SERVER_CLOSED_CONNECTION_ERROR)
/usr/local/lib/python3.7/site-packages/redis/connection.py in readline(self)
239 while not data.endswith(SYM_CRLF):
240 # there's more data in the socket that we need
--> 241 self._read_from_socket()
242 buf.seek(self.bytes_read)
243 data = buf.readline()
/usr/local/lib/python3.7/site-packages/redis/connection.py in _read_from_socket(self, length, timeout, raise_on_timeout)
184 # an empty string indicates the server shutdown the socket
185 if isinstance(data, bytes) and len(data) == 0:
--> 186 raise ConnectionError(SERVER_CLOSED_CONNECTION_ERROR)
187 buf.write(data)
188 data_length = len(data)
This started happening after a move to hosted redis from a local instance
So, the problem is that the Redis connection string on those hosted solutions has to start with rediss:// as in redis + SSL as per the official documentation:
https://redislabs.com/lp/python-redis/
If you use hosted Redis from AWS or Digital Ocean, this might happen to you as well :)
If you are using Celery, you would also need to modify your app config in app.py as per
https://github.com/celery/celery/issues/5371

RuntimeError: latex was not able to process the following string: '_auto'

To whom it may concern,
I ran into a problem running Python code built by the LIGO team in a Jupyter notebook. When I run the following simple code outside the Jupyter notebook environment, I obtain the correct output and the attached figure.
+++++++++++++++++++++++++++++++++
from gwpy.timeseries import TimeSeries
from numpy import random
series = TimeSeries(random.random(1000), sample_rate=100, unit='m')
plot = series.plot()
plot.show()
+++++++++++++++++++++++++++
Generation of random plots
But when I applied the following similar python code in the jupyter notebook:
++++++++++++++++
`%matplotlib inline`
%config InlineBackend.figure_format = 'retina'
from gwpy.timeseries import TimeSeries
from numpy import random
series = TimeSeries(random.random(1000), sample_rate=100, unit='m')
plot = series.plot()
++++++++++++++++
I obtained a runtime error related to LaTeX.
Since I get the correct output when I do not use Jupyter, I suspect that the problem is caused by the conversion of the plot for display in the notebook environment.
I looked into the LaTeX runtime error and tried installing the related packages (e.g., dvipng, texlive-latex-extra & texlive-fonts-recommended) through MacPorts, and also re-installing MacTeX on my machine, but the problem still exists.
Here are the warning & error messages I obtained in the Jupyter notebook.
RuntimeError Traceback (most recent call last)
/Users/lupin/Library/Python/2.7/lib/python/site-packages/IPython/core/formatters.pyc in __call__(self, obj)
332 pass
333 else:
--> 334 return printer(obj)
335 # Finally look for special method names
336 method = get_real_method(obj, self.print_method)
/Users/lupin/Library/Python/2.7/lib/python/site-packages/IPython/core/pylabtools.pyc in <lambda>(fig)
247 png_formatter.for_type(Figure, lambda fig: print_figure(fig, 'png', **kwargs))
248 if 'retina' in formats or 'png2x' in formats:
--> 249 png_formatter.for_type(Figure, lambda fig: retina_figure(fig, **kwargs))
250 if 'jpg' in formats or 'jpeg' in formats:
251 jpg_formatter.for_type(Figure, lambda fig: print_figure(fig, 'jpg', **kwargs))
/Users/lupin/Library/Python/2.7/lib/python/site-packages/IPython/core/pylabtools.pyc in retina_figure(fig, **kwargs)
137 def retina_figure(fig, **kwargs):
138 """format a figure as a pixel-doubled (retina) PNG"""
--> 139 pngdata = print_figure(fig, fmt='retina', **kwargs)
140 # Make sure that retina_figure acts just like print_figure and returns
141 # None when the figure is empty.
/Users/lupin/Library/Python/2.7/lib/python/site-packages/IPython/core/pylabtools.pyc in print_figure(fig, fmt, bbox_inches, **kwargs)
129
130 bytes_io = BytesIO()
--> 131 fig.canvas.print_figure(bytes_io, **kw)
132 data = bytes_io.getvalue()
133 if fmt == 'svg':
/Library/Python/2.7/site-packages/matplotlib/backend_bases.pyc in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, **kwargs)
2212 **kwargs)
2213 renderer = self.figure._cachedRenderer
-> 2214 bbox_inches = self.figure.get_tightbbox(renderer)
2215
2216 bbox_artists = kwargs.pop("bbox_extra_artists", None)
/Library/Python/2.7/site-packages/matplotlib/figure.pyc in get_tightbbox(self, renderer)
2188 for ax in self.axes:
2189 if ax.get_visible():
-> 2190 bb.append(ax.get_tightbbox(renderer))
2191
2192 if len(bb) == 0:
/Library/Python/2.7/site-packages/matplotlib/axes/_base.pyc in get_tightbbox(self, renderer, call_axes_locator)
4168 bb.append(self._right_title.get_window_extent(renderer))
4169
-> 4170 bb_xaxis = self.xaxis.get_tightbbox(renderer)
4171 if bb_xaxis:
4172 bb.append(bb_xaxis)
/Library/Python/2.7/site-packages/matplotlib/axis.pyc in get_tightbbox(self, renderer)
1158 for a in [self.label, self.offsetText]:
1159 if a.get_visible():
-> 1160 bb.append(a.get_window_extent(renderer))
1161
1162 bb.extend(ticklabelBoxes)
/Library/Python/2.7/site-packages/matplotlib/text.pyc in get_window_extent(self, renderer, dpi)
920 raise RuntimeError('Cannot get window extent w/o renderer')
921
--> 922 bbox, info, descent = self._get_layout(self._renderer)
923 x, y = self.get_unitless_position()
924 x, y = self.get_transform().transform_point((x, y))
/Library/Python/2.7/site-packages/matplotlib/text.pyc in _get_layout(self, renderer)
307 w, h, d = renderer.get_text_width_height_descent(clean_line,
308
self._fontproperties,
--> 309 ismath=ismath)
310 else:
311 w, h, d = 0, 0, 0
/Library/Python/2.7/site-packages/matplotlib/backends/backend_agg.pyc in get_text_width_height_descent(self, s, prop, ismath)
230 fontsize = prop.get_size_in_points()
231 w, h, d = texmanager.get_text_width_height_descent(
--> 232 s, fontsize, renderer=self)
233 return w, h, d
234
/Library/Python/2.7/site-packages/matplotlib/texmanager.pyc in get_text_width_height_descent(self, tex, fontsize, renderer)
499 else:
500 # use dviread. It sometimes returns a wrong descent.
--> 501 dvifile = self.make_dvi(tex, fontsize)
502 with dviread.Dvi(dvifile, 72 * dpi_fraction) as dvi:
503 page = next(iter(dvi))
/Library/Python/2.7/site-packages/matplotlib/texmanager.pyc in make_dvi(self, tex, fontsize)
363 self._run_checked_subprocess(
364 ["latex", "-interaction=nonstopmode", "--halt-on-error",
--> 365 texfile], tex)
366 for fname in glob.glob(basefile + '*'):
367 if not fname.endswith(('dvi', 'tex')):
/Library/Python/2.7/site-packages/matplotlib/texmanager.pyc in _run_checked_subprocess(self, command, tex)
342 prog=command[0],
343 tex=tex.encode('unicode_escape'),
--> 344 exc=exc.output.decode('utf-8')))
345 _log.debug(report)
346 return report
RuntimeError: latex was not able to process the following string:
'_auto'
Here is the full report generated by latex:
This is pdfTeX, Version 3.14159265-2.6-1.40.18 (TeX Live 2017/MacPorts 2017_4) (preloaded format=latex)
restricted \write18 enabled.
entering extended mode
(/Users/lupin/.matplotlib/tex.cache/1de80ee53f095837776b678f34112ba4.tex
LaTeX2e <2017-04-15>
Babel <3.10> and hyphenation patterns for 3 language(s) loaded.
(/opt/local/share/texmf-texlive/tex/latex/base/article.cls
Document Class: article 2014/09/29 v1.4h Standard LaTeX document class
(/opt/local/share/texmf-texlive/tex/latex/base/size10.clo))
(/opt/local/share/texmf-texlive/tex/latex/type1cm/type1cm.sty)
(/opt/local/share/texmf-texlive/tex/latex/base/textcomp.sty
(/opt/local/share/texmf-texlive/tex/latex/base/ts1enc.def))
(/opt/local/share/texmf-texlive/tex/latex/geometry/geometry.sty
(/opt/local/share/texmf-texlive/tex/latex/graphics/keyval.sty)
(/opt/local/share/texmf-texlive/tex/generic/oberdiek/ifpdf.sty)
(/opt/local/share/texmf-texlive/tex/generic/oberdiek/ifvtex.sty)
(/opt/local/share/texmf-texlive/tex/generic/ifxetex/ifxetex.sty)
Package geometry Warning: Over-specification in `h'-direction.
`width' (5058.9pt) is ignored.
Package geometry Warning: Over-specification in `v'-direction.
`height' (5058.9pt) is ignored.
) (./1de80ee53f095837776b678f34112ba4.aux)
(/opt/local/share/texmf-texlive/tex/latex/base/ts1cmr.fd)
*geometry* driver: auto-detecting
*geometry* detected driver: dvips
! Missing $ inserted.
<inserted text>
$
l.13 \fontsize{20.000000}{25.000000}{\rmfamily _
auto}
No pages of output.
Transcript written on 1de80ee53f095837776b678f34112ba4.log.
Can anyone provide some suggestions to solve this issue?
In LaTeX, '_' denotes a subscript and is only valid inside math mode ($...$), so LaTeX cannot process a bare '_auto' in ordinary text (hence the "Missing $ inserted" error). Replacing '_auto' with '-auto' may help :)

H2O machine learning platform for Python incurs EnvironmentError while building models

I am new to the H2O machine learning platform and am having the issue below while trying to build models.
When I was trying to build 5 GBM models with a not-so-large dataset, I got the following error:
gbm Model Build Progress: [##################################################] 100%
gbm Model Build Progress: [##################################################] 100%
gbm Model Build Progress: [##################################################] 100%
gbm Model Build Progress: [##################################################] 100%
gbm Model Build Progress: [################# ] 34%
EnvironmentErrorTraceback (most recent call last)
<ipython-input-22-e74b34df2f1a> in <module>()
13 params_model={'x': features_pca_all, 'y': response, 'training_frame': train_holdout_pca_hex, 'validation_frame': validation_holdout_pca_hex, 'ntrees': ntree, 'max_depth':depth, 'min_rows': min_rows, 'learn_rate': 0.005}
14
---> 15 gbm_model=h2o.gbm(**params_model)
16
17 #store model
C:\Anaconda2\lib\site-packages\h2o\h2o.pyc in gbm(x, y, validation_x, validation_y, training_frame, model_id, distribution, tweedie_power, ntrees, max_depth, min_rows, learn_rate, nbins, nbins_cats, validation_frame, balance_classes, max_after_balance_size, seed, build_tree_one_node, nfolds, fold_column, fold_assignment, keep_cross_validation_predictions, score_each_iteration, offset_column, weights_column, do_future, checkpoint)
1058 parms = {k:v for k,v in locals().items() if k in ["training_frame", "validation_frame", "validation_x", "validation_y", "offset_column", "weights_column", "fold_column"] or v is not None}
1059 parms["algo"]="gbm"
-> 1060 return h2o_model_builder.supervised(parms)
1061
1062
C:\Anaconda2\lib\site-packages\h2o\h2o_model_builder.pyc in supervised(kwargs)
28 algo = kwargs["algo"]
29 parms={k:v for k,v in kwargs.items() if (k not in ["x","y","validation_x","validation_y","algo"] and v is not None) or k=="validation_frame"}
---> 30 return supervised_model_build(x,y,vx,vy,algo,offsets,weights,fold_column,parms)
31
32 def unsupervised_model_build(x,validation_x,algo_url,kwargs): return _model_build(x,None,validation_x,None,algo_url,None,None,None,kwargs)
C:\Anaconda2\lib\site-packages\h2o\h2o_model_builder.pyc in supervised_model_build(x, y, vx, vy, algo, offsets, weights, fold_column, kwargs)
16 if not is_auto_encoder and y is None: raise ValueError("Missing response")
17 if vx is not None and vy is None: raise ValueError("Missing response validating a supervised model")
---> 18 return _model_build(x,y,vx,vy,algo,offsets,weights,fold_column,kwargs)
19
20 def supervised(kwargs):
C:\Anaconda2\lib\site-packages\h2o\h2o_model_builder.pyc in _model_build(x, y, vx, vy, algo, offsets, weights, fold_column, kwargs)
86 do_future = kwargs.pop("do_future") if "do_future" in kwargs else False
87 future_model = H2OModelFuture(H2OJob(H2OConnection.post_json("ModelBuilders/"+algo, **kwargs), job_type=(algo+" Model Build")), x)
---> 88 return future_model if do_future else _resolve_model(future_model, **kwargs)
89
90 def _resolve_model(future_model, **kwargs):
C:\Anaconda2\lib\site-packages\h2o\h2o_model_builder.pyc in _resolve_model(future_model, **kwargs)
89
90 def _resolve_model(future_model, **kwargs):
---> 91 future_model.poll()
92 if '_rest_version' in kwargs.keys(): model_json = H2OConnection.get_json("Models/"+future_model.job.dest_key, _rest_version=kwargs['_rest_version'])["models"][0]
93 else: model_json = H2OConnection.get_json("Models/"+future_model.job.dest_key)["models"][0]
C:\Anaconda2\lib\site-packages\h2o\model\model_future.pyc in poll(self)
8
9 def poll(self):
---> 10 self.job.poll()
11 self.x = None
C:\Anaconda2\lib\site-packages\h2o\job.pyc in poll(self)
39 time.sleep(sleep)
40 if sleep < 1.0: sleep += 0.1
---> 41 self._refresh_job_view()
42 running = self._is_running()
43 self._update_progress()
C:\Anaconda2\lib\site-packages\h2o\job.pyc in _refresh_job_view(self)
52
53 def _refresh_job_view(self):
---> 54 jobs = H2OConnection.get_json(url_suffix="Jobs/" + self.job_key)
55 self.job = jobs["jobs"][0] if "jobs" in jobs else jobs["job"][0]
56 self.status = self.job["status"]
C:\Anaconda2\lib\site-packages\h2o\connection.pyc in get_json(url_suffix, **kwargs)
410 if __H2OCONN__ is None:
411 raise ValueError("No h2o connection. Did you run `h2o.init()` ?")
--> 412 return __H2OCONN__._rest_json(url_suffix, "GET", None, **kwargs)
413
414 #staticmethod
C:\Anaconda2\lib\site-packages\h2o\connection.pyc in _rest_json(self, url_suffix, method, file_upload_info, **kwargs)
419
420 def _rest_json(self, url_suffix, method, file_upload_info, **kwargs):
--> 421 raw_txt = self._do_raw_rest(url_suffix, method, file_upload_info, **kwargs)
422 return self._process_tables(raw_txt.json())
423
C:\Anaconda2\lib\site-packages\h2o\connection.pyc in _do_raw_rest(self, url_suffix, method, file_upload_info, **kwargs)
476
477 begin_time_seconds = time.time()
--> 478 http_result = self._attempt_rest(url, method, post_body, file_upload_info)
479 end_time_seconds = time.time()
480 elapsed_time_seconds = end_time_seconds - begin_time_seconds
C:\Anaconda2\lib\site-packages\h2o\connection.pyc in _attempt_rest(self, url, method, post_body, file_upload_info)
526
527 except requests.ConnectionError as e:
--> 528 raise EnvironmentError("h2o-py encountered an unexpected HTTP error:\n {}".format(e))
529
530 return http_result
EnvironmentError: h2o-py encountered an unexpected HTTP error:
('Connection aborted.', BadStatusLine("''",))
My hunch is that the cluster has only around 247.5 MB of memory, which is not enough to handle the model building, and hence the connection to h2o was aborted. Here is the code I used to initiate h2o:
#initialization of h2o module
import subprocess as sp
import sys
import os.path as p
# path of h2o jar file
h2o_path = p.join(sys.prefix, "h2o_jar", "h2o.jar")
# subprocess to launch h2o
# the command can be further modified to include virtual machine parameters
sp.Popen("java -jar " + h2o_path)
# h2o.init() call to verify that h2o launch is successfull
h2o.init(ip="localhost", port=54321, size=1, start_h2o=False, enable_assertions=False, \
license=None, max_mem_size_GB=4, min_mem_size_GB=4, ice_root=None)
and here is the returned status table:
Any ideas on the above would be greatly appreciated!!
Just to close out this question, I'll restate the solution mentioned in the comments above. The user was able to resolve the issue by starting H2O from the command line with 1GB of memory using java -jar -Xmx1g h2o.jar, and then connected to the existing H2O server in Python using h2o.init().
It's not clear to me why h2o.init() was not creating the correct size cluster using the max_mem_size_GB argument. Regardless, this argument has been deprecated recently and replaced by another argument, max_mem_size, so it may no longer be an issue.

happybase integration not working with HBase

I am able to connect with my hbase
connection = happybase.Connection(host='node-04',port=16000)
table = connection.table('test')
These 2 commands work without any error, but when I run the command below I get the following error:
print connection.tables()
error
Traceback (most recent call last)
<ipython-input-49-de0848d7286f> in <module>()
----> 1 print connection.tables()
/root/anaconda2/lib/python2.7/site-packages/happybase/connection.pyc in tables(self)
236 :rtype: List of strings
237 """
--> 238 names = self.client.getTableNames()
239
240 # Filter using prefix, and strip prefix from names
/root/anaconda2/lib/python2.7/site-packages/happybase/hbase/Hbase.pyc in getTableNames(self)
815 #return returns a list of names
816 """
--> 817 self.send_getTableNames()
818 return self.recv_getTableNames()
819
/root/anaconda2/lib/python2.7/site-packages/happybase/hbase/Hbase.pyc in send_getTableNames(self)
823 args.write(self._oprot)
824 self._oprot.writeMessageEnd()
--> 825 self._oprot.trans.flush()
826
827 def recv_getTableNames(self, ):
/root/anaconda2/lib/python2.7/site-packages/thrift/transport/TTransport.pyc in flush(self)
172 # reset wbuf before write/flush to preserve state on underlying failure
173 self.__wbuf = StringIO()
--> 174 self.__trans.write(out)
175 self.__trans.flush()
176
/root/anaconda2/lib/python2.7/site-packages/thrift/transport/TSocket.pyc in write(self, buff)
128 have = len(buff)
129 while sent < have:
--> 130 plus = self.handle.send(buff)
131 if plus == 0:
132 raise TTransportException(type=TTransportException.END_OF_FILE,
error: [Errno 32] Broken pipe
I am using HBase version: 1.1.2.2.3.4.0-3485
Please help, or suggest any package I can use to write code for HBase using Python.
happybase requires you to connect to the Thrift daemon, which you need to start on your HBase cluster. happybase does not connect to HBase nodes directly.
Judging from the port number, you are not connecting to Thrift (which uses port 9090 by default) but to the HBase master. This is not how happybase works.