Heroku MemCachier will not store cache - flask

I am quite new to coding in general and am trying to deploy MemCachier for the first time.
I followed this guide:
https://devcenter.heroku.com/articles/flask-memcache
I have a function in a Flask app that stores a df in the cache. I am working locally and haven't deployed the changes yet. The app works fine and I can access the df in my app, but the MemCachier GUI on Heroku does not show any data stored, so I am assuming it is falling back to cache.init_app(app, config={'CACHE_TYPE': 'simple'}).
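A quick way to confirm which branch runs locally is to check the environment variable before calling init_app, for example:
import os

# None here means the code below falls back to the in-process SimpleCache,
# so nothing is ever sent to MemCachier.
print(os.environ.get('MEMCACHIER_SERVERS'))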
My code + function is:
#set memcache in Heroku
cache_servers = os.environ.get('MEMCACHIER_SERVERS')
if cache_servers == None:
    cache.init_app(app, config={'CACHE_TYPE': 'simple'})
else:
    cache_user = os.environ.get('MEMCACHIER_USERNAME') or ''
    cache_pass = os.environ.get('MEMCACHIER_PASSWORD') or ''
    cache.init_app(app,
        config={'CACHE_TYPE': 'saslmemcached',
                'CACHE_MEMCACHED_SERVERS': cache_servers.split(','),
                'CACHE_MEMCACHED_USERNAME': cache_user,
                'CACHE_MEMCACHED_PASSWORD': cache_pass,
                'CACHE_OPTIONS': {'behaviors': {
                    # Faster IO
                    'tcp_nodelay': True,
                    # Keep connection alive
                    'tcp_keepalive': True,
                    # Timeout for set/get requests
                    'connect_timeout': 2000,  # ms
                    'send_timeout': 750 * 1000,  # us
                    'receive_timeout': 750 * 1000,  # us
                    '_poll_timeout': 2000,  # ms
                    # Better failover
                    'ketama': True,
                    'remove_failed': 1,
                    'retry_timeout': 2,
                    'dead_timeout': 30}}})
def symbol_search():
    flo = BytesIO()
    directory = 'symboldirectory'
    filenames = ('otherlisted.txt', 'nasdaqlisted.txt')
    ftp = FTP('ftp.nasdaqtrader.com')
    ftp.login()
    ftp.cwd(directory)
    # Create pandas dataframes from the nasdaqlisted and otherlisted files.
    for item in filenames:
        nasdaq_exchange_info = []
        ftp.retrbinary('RETR ' + item, flo.write)
        flo.seek(0)
        nasdaq_exchange_info.append(pd.read_fwf(flo))
    ftp.quit()
    # Combine and clean up the dataframes.
    nasdaq_exchange_info = pd.concat(nasdaq_exchange_info, axis=1)
    nasdaq_exchange_info[['symbol', 'name', 'Exchange', 'Symbol', 'etf', 'Lot_size', 'Test', 'NASDAQ_Symbol']] = nasdaq_exchange_info['ACT Symbol|Security Name|Exchange|CQS Symbol|ETF|Round Lot Size|Test Issue|NASDAQ Symbol'].str.split('|', expand=True)
    nasdaq_exchange_info = nasdaq_exchange_info.drop(nasdaq_exchange_info.columns[[0]], axis=1).dropna()
    nasdaq_exchange_info = nasdaq_exchange_info[(nasdaq_exchange_info['Test'] != 'Y') & (nasdaq_exchange_info['symbol'] != 'Y') & (~nasdaq_exchange_info.symbol.str.contains('symbol', 'file')) & (~nasdaq_exchange_info.name.str.contains('%', 'arrant'))]
    nasdaq_exchange_info = nasdaq_exchange_info.drop(['Symbol', 'Exchange', 'Lot_size', 'Test', 'NASDAQ_Symbol', 'etf'], axis=1)
    nasdaq_exchange_info = nasdaq_exchange_info[['name', 'symbol']].values.tolist()
    return cache.set("nasdaq_exchange_info", nasdaq_exchange_info)

symbol_search()
What am I missing here, and how can I get the cached data into MemCachier so that it shows up in the GUI?

Related

How to deploy a deep learning model (computer vision) in AWS using Lambda

I have trained a background-removal model on my custom images and I am new to deploying computer vision models. Can anyone please share a blog on how to deploy a PyTorch deep learning model (computer vision) using Lambda?
I have written some Lambda functions to take an input image, predict the segmentation, and return a background-removed image as output.
I am not sure whether this function is correct or not, so please check this function as well.
# Define imports
try:
    import unzip_requirements
except ImportError:
    pass
import json
from io import BytesIO
import time
import os
import base64
import boto3
import numpy as np
from skimage import io
import matplotlib.pyplot as plt
from preprocessing import RescaleT, ToTensorLab
import torch
from PIL import Image
from network.u2net import U2NET
# Define two helper functions inside handler.py: img_to_base64_str to
# convert binary images to base64 format and load_models to load the
# pretrained U-2-Net model from S3 and keep it in memory between
# invocations
def img_to_base64_str(img):
    buffered = BytesIO()
    img.save(buffered, format="PNG")
    buffered.seek(0)
    img_byte = buffered.getvalue()
    img_str = "data:image/png;base64," + base64.b64encode(img_byte).decode()
    return img_str

def load_models(s3, bucket):
    model = U2NET(3, 1)
    response = s3.get_object(Bucket=bucket, Key="models/u2net/u2net.pth")
    state = torch.load(BytesIO(response["Body"].read()), map_location=torch.device('cpu'))
    model.load_state_dict(state)
    model.eval()
    return model

def preprocess_raw_img(raw_img_array):
    """
    This function preprocesses a raw input array in a way such that it can be fed into the U-2-Net architecture
    :param raw_img_array:
    :return:
    """
    rescaler = RescaleT(320)
    rescaled_img = rescaler(raw_img_array)
    tensor_converter = ToTensorLab(flag=0)
    tensor_img = tensor_converter(rescaled_img)
    tensor_img = tensor_img.unsqueeze(0)
    return tensor_img

def normPRED(d):
    ma = torch.max(d)
    mi = torch.min(d)
    dn = (d - mi) / (ma - mi)
    return dn

def resize_img_to_orig(prediction_np, orig_img):
    image = Image.fromarray(prediction_np * 255).convert('RGB')
    image_original = image.resize((orig_img.shape[1], orig_img.shape[0]), resample=Image.BILINEAR)
    return image_original

def mask_to_orig_size(orig_img, rescale, threshold):
    mask_orig_size = np.array(orig_img, dtype=np.float64)
    mask_orig_size /= rescale
    mask_orig_size[mask_orig_size > threshold] = 1
    mask_orig_size[mask_orig_size <= threshold] = 0
    return mask_orig_size

def extract_foreground(mask_orig_size):
    shape = mask_orig_size.shape
    a_layer_init = np.ones(shape=(shape[0], shape[1], 1))
    mul_layer = np.expand_dims(mask_orig_size[:, :, 0], axis=2)
    a_layer = mul_layer * a_layer_init
    rgba_out = np.append(mask_orig_size, a_layer, axis=2)
    return rgba_out

def input_to_rgba_inp(input_arr, rescale):
    input_arr = np.array(input_arr, dtype=np.float64)
    shape = input_arr.shape
    input_arr /= rescale
    a_layer = np.ones(shape=(shape[0], shape[1], 1))
    rgba_inp = np.append(input_arr, a_layer, axis=2)
    return rgba_inp

def u2net_api_call(raw_img_array, model):
    """
    This function takes as input an image array of any size. The goal is to return only the object in the foreground of
    the image.
    Therefore, the raw input image is preprocessed, fed into the deep learning model. Afterwards the foreground of the
    original image is extracted from the mask which was generated by the deep learning model.
    """
    THRESHOLD = 0.9
    RESCALE = 255
    preprocessed_img = preprocess_raw_img(raw_img_array)
    d1, d2, d3, d4, d5, d6, d7 = model(preprocessed_img)
    prediction = d1[:, 0, :, :]
    prediction = normPRED(prediction)
    prediction_np = prediction.squeeze().cpu().data.numpy()
    img_orig_size = resize_img_to_orig(prediction_np, raw_img_array)
    mask_orig_size = mask_to_orig_size(img_orig_size, RESCALE, THRESHOLD)
    rgba_out = extract_foreground(mask_orig_size)
    rgba_inp = input_to_rgba_inp(raw_img_array, RESCALE)
    rem_back = (rgba_inp * rgba_out)
    return rem_back
s3 = boto3.client("s3")
bucket = "sagemaker-m-model"
model = load_models(s3, bucket)
def lambda_handler(event, Context):
    if event.get("source") in ["aws.events", "serverless-plugin-warmup"]:
        print('Lambda is warm!')
        return {}
    data = json.loads(event["body"])
    print("data keys :", data.keys())
    image = data["image"]
    image = image[image.find(",") + 1:]
    dec = base64.b64decode(image + "===")
    image = Image.open(BytesIO(dec))
    #image = image.convert("RGB")
    # the single U-2-Net model loaded at module level is used directly
    # resize the image based on the load_size payload
    #load_size = int(data["load_size"])
    with torch.no_grad():
        background_removed = u2net_api_call(image, model)
    # deprocess: u2net_api_call returns an RGBA float array scaled to (0, 1)
    output_image = np.uint8(background_removed * 255)
    output_image = Image.fromarray(output_image)
    # convert the PIL image to base64
    result = {
        "output": img_to_base64_str(output_image)
    }
    # send the result back to the client inside the body field
    return {
        "statusCode": 200,
        "body": json.dumps(result),
        "headers": {
            'Content-Type': 'application/json',
            'Access-Control-Allow-Origin': '*'
        }
    }
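As a rough local smoke test (assuming the preprocessing and network modules are importable locally and the model can be loaded from S3; test.png is just a placeholder file name), the handler can be invoked with an event shaped like the API Gateway payload it expects:
import base64
import json

# build a fake API Gateway-style event carrying a base64 data URL in the body
with open("test.png", "rb") as f:
    payload = "data:image/png;base64," + base64.b64encode(f.read()).decode()

event = {"body": json.dumps({"image": payload})}
response = lambda_handler(event, None)
print(response["statusCode"], len(response["body"]))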
I have tried deploying with the Serverless framework and I got some errors that I don't understand how to solve.
Running "serverless" from node_modules
Warning: Invalid configuration encountered
at 'custom.warmup.events': must be object
at 'custom.warmup.timeout': must be object
at 'functions.transformImage.warmup': must be object
Learn more about configuration validation here: http://slss.io/configuration-validation
Deploying br to stage dev (us-east-1)
Warning: WarmUp: Skipping warmer "events" creation. No functions to warm up.
Warning: WarmUp: Skipping warmer "timeout" creation. No functions to warm up.
✖ Stack br-dev failed to deploy (11s)
Environment: linux, node 16.14.0, framework 3.4.0 (local) 3.4.0v (global), plugin 6.1.2, SDK 4.3.1
Docs: docs.serverless.com
Support: forum.serverless.com
Bugs: github.com/serverless/serverless/issues
Error:
Error: `docker run --rm -v /home/suri/project1/rmbg/br/cache/cf58e2124c894818b4beab8df9ac26ac92eeb326c8c74fc7e60e8f08ea86df1e_x86_64_slspyc:/var/task:z -v /home/suri/project1/rmbg/br/cache/downloadCacheslspyc:/var/useDownloadCache:z lambci/lambda:build-python3.6 /bin/sh -c chown -R 0\:0 /var/useDownloadCache && python3.6 -m pip install -t /var/task/ -r /var/task/requirements.txt --cache-dir /var/useDownloadCache && chown -R 0\:0 /var/task && chown -R 0\:0 /var/useDownloadCache` Exited with code 1
at ChildProcess.<anonymous> (/home/suri/project1/rmbg/br/node_modules/child-process-ext/spawn.js:38:8)
at ChildProcess.emit (node:events:520:28)
at ChildProcess.emit (node:domain:475:12)
at maybeClose (node:internal/child_process:1092:16)
at Process.ChildProcess._handle.onexit (node:internal/child_process:302:5)
3 deprecations found: run 'serverless doctor' for more details

How do I create and append data from a CSV file to BigQuery and partition the table using Python?

I have compressed CSV gzip files in Google Cloud Storage and, using Python, I am auto-detecting the schema and creating a new table in Google BigQuery depending on the naming convention. How do I partition the table being created? I already have a Date column in the data that I would like to use.
# importing libraries
from google.cloud import bigquery

# defining first load list
first_load_list = []

# defining tracker file
tracker_file = open("tracker_file", "a")

# reading values from config file
config_file = open("ingestion.config", "r")
for line in config_file:
    if "project_id" in line:
        project_id = line.split("=")[1].strip()
    elif "dataset" in line:
        dataset = line.split("=")[1].strip()
    elif "gcs_location" in line:
        gcs_location = line.split("=")[1].strip()
    elif "bq1_target_table" in line:
        bq1_target_table = line.split("=")[1].strip()
    elif "bq2_target_table" in line:
        bq2_target_table = line.split("=")[1].strip()
    elif "bq1_first_load_filename" in line:
        bq1_first_load_filename = line.split("=")[1].strip()
        first_load_list.append(bq1_first_load_filename)
    elif "bq2_first_load_filename" in line:
        bq2_first_load_filename = line.split("=")[1].strip()
        first_load_list.append(bq2_first_load_filename)
    elif "gcs_bucket" in line:
        gcs_bucket = line.split("=")[1].strip()

# reading bucket list temp file
bucket_list_file = open("bucket_list.temp", "r")
bucket_list = []
for entry in bucket_list_file:
    bucket_list.append(entry)

# defining client and specifying project
client = bigquery.Client(project_id)
dataset_id = dataset
dataset_ref = client.dataset(dataset_id)
job_config = bigquery.LoadJobConfig()
job_config.autodetect = True
job_config.skip_leading_rows = 1
job_config.source_format = bigquery.SourceFormat.CSV

# loading files into tables based on naming convention
for filename in first_load_list:
    if "BQ2_2" in filename:
        uri = gcs_location + filename
        print "Processing file = " + uri
        load_job = client.load_table_from_uri(
            uri.strip(),
            dataset_ref.table(bq2_target_table),
            job_config=job_config)  # API request
        assert load_job.job_type == 'load'
        load_job.result()  # Waits for table load to complete.
        assert load_job.state == 'DONE'
        assert client.get_table(dataset_ref.table(bq2_target_table))
        tracker_file.write(filename + "\n")
        print filename.strip() + " processing complete\n"
    elif "BQ1_2" in filename:
        uri = gcs_location + filename
        print "Processing file = " + uri
        load_job = client.load_table_from_uri(
            uri.strip(),
            dataset_ref.table(bq1_target_table),
            job_config=job_config)  # API request
        assert load_job.job_type == 'load'
        load_job.result()  # Waits for table load to complete.
        assert load_job.state == 'DONE'
        assert client.get_table(dataset_ref.table(bq1_target_table))
        tracker_file.write(filename + "\n")
        print filename.strip() + " processing complete\n"

tracker_file.close()
This is the code that I run for the first load. Once the first-load tables are created, I then only want to append data to these tables going forward. I looked at https://cloud.google.com/bigquery/docs/creating-partitioned-tables but I can't figure out how to implement it in Python.
Can anyone help to point me in the right direction please?
You can use job_config._properties['load']['timePartitioning'] = {"type": "DAY", 'field': 'your_field'} to create a partitioned table on load. I just tested it on my end with test data and it worked as expected.
Please note that partitioning through the API only supports 'DAY' for now.
See GitHub issue
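Folded into the first-load loop above, and switching to an append for later runs, it might look roughly like this ('Date' is just a placeholder for your date column, and the exact private-property path can vary by client-library version; newer releases expose the same setting publicly as job_config.time_partitioning):
job_config = bigquery.LoadJobConfig()
job_config.autodetect = True
job_config.skip_leading_rows = 1
job_config.source_format = bigquery.SourceFormat.CSV
# partition the destination table by day on the existing date column
job_config._properties['load']['timePartitioning'] = {'type': 'DAY', 'field': 'Date'}
# newer client versions: job_config.time_partitioning = bigquery.TimePartitioning(type_='DAY', field='Date')
# append to the existing table on subsequent loads instead of recreating it
job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND

load_job = client.load_table_from_uri(
    uri.strip(),
    dataset_ref.table(bq1_target_table),
    job_config=job_config)
load_job.result()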

Is there any faster way of downloading multiple files from S3 to a local folder?

I am trying to download 12,000 files from an S3 bucket using a Jupyter notebook, which is estimated to take about 21 hours to complete. This is because each file is downloaded one at a time. Can we do multiple downloads in parallel so I can speed up the process?
Currently, I am using the following code to download all files
### Get unique full-resolution image basenames
images = df['full_resolution_image_basename'].unique()
print(f'No. of unique full-resolution images: {len(images)}')

### Create a folder for full-resolution images
images_dir = './images/'
os.makedirs(images_dir, exist_ok=True)

### Download images
images_str = "','".join(images)
limiting_clause = f"CONTAINS(ARRAY['{images_str}'], full_resolution_image_basename)"
_ = download_full_resolution_images(images_dir, limiting_clause=limiting_clause)
See the code below. This will only work with Python 3.6+, because of the f-strings (PEP 498). Use a different method of string formatting for older versions of Python.
Provide the relative_path, bucket_name and s3_object_keys. In addition, max_workers is optional, and if not provided the number defaults to 5 times the number of machine processors.
Most of the code for this answer came from an answer to How to create an async generator in Python?, which in turn draws on this example documented in the library.
import boto3
import os
from concurrent import futures

relative_path = './images'
bucket_name = 'bucket_name'
s3_object_keys = []  # List of S3 object keys
max_workers = 5

abs_path = os.path.abspath(relative_path)
s3 = boto3.client('s3')

def fetch(key):
    file = f'{abs_path}/{key}'
    os.makedirs(file, exist_ok=True)
    with open(file, 'wb') as data:
        s3.download_fileobj(bucket_name, key, data)
    return file

def fetch_all(keys):
    with futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_key = {executor.submit(fetch, key): key for key in keys}
        print("All URLs submitted.")
        for future in futures.as_completed(future_to_key):
            key = future_to_key[future]
            exception = future.exception()
            if not exception:
                yield key, future.result()
            else:
                yield key, exception

for key, result in fetch_all(s3_object_keys):
    print(f'key: {key} result: {result}')
Thank you for this. I had over 9,000 JPEG images that I needed to download from my S3. I tried to incorporate this directly into my Colab Pro but wasn't able to get it to work; I kept getting an "Errno 21: Is a directory" error.
I had to add two things: 1) a makedirs call to create the directory I want, and 2) use mknod instead of mkdir.
fetch_all is almost the same, except for a small edit so that max_workers actually takes effect. s3c is just my boto3.client with my keys and all.
My download time went from 30+ minutes to 5 minutes with 1000 workers.
os.makedirs('/*some dir you want*/*prefix*')

def fetch(key):
    file = f'{abs_path}/{key}'
    os.mknod(file, mode=384)
    with open(file, 'wb') as data:
        s3c.download_fileobj(bucket_name, key, data)
    return file

def fetch_all(keys):
    with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_key = {executor.submit(fetch, key): key for key in keys}
        print("All URLs submitted.")
        for future in futures.as_completed(future_to_key):
            key = future_to_key[future]
            exception = future.exception()
            if not exception:
                yield key, future.result()
            else:
                yield key, exception
You can try this out; it is fast:
import os
import boto3
from datetime import datetime
from multiprocessing import Pool

bucket_name = 'BUCKET_NAME'
prefix = 'PREFIX'
local_dir = './downloads/'  # PUT YOUR LOCAL DIR
max_process = 20  # CAN BE CHANGED
debug_en = True

# pass your credentials and region name
s3_client = boto3.client('s3', aws_access_key_id=' ',
                         aws_secret_access_key=' ', region_name=' ')

def downfiles(bucket_name, src_obj, dest_path):
    try:
        s3_client.download_file(bucket_name, src_obj, dest_path)
        if debug_en:
            print("[debug] downloading object: %s to %s" % (src_obj, dest_path))
    except:
        pass

def download_dir(bucket_name, sub_prefix):
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=sub_prefix)
    pool = Pool(max_process)
    print(pool)
    mp_data = []
    for page in pages:
        if 'Contents' in page:
            for obj in page['Contents']:
                src_obj = obj['Key']
                dest_path = local_dir + src_obj
                mp_data.append((bucket_name, src_obj, dest_path))
                os.path.dirname(dest_path) and os.makedirs(os.path.dirname(dest_path), exist_ok=True)
    pool.starmap(downfiles, mp_data)
    return len(mp_data)

if __name__ == '__main__':
    print("starting script...")
    start_time = datetime.now()
    s3_dirs = [prefix]  # list of prefixes to download
    total_files = 0
    for s3_dir in s3_dirs:
        print("[Information] %s directory is downloading" % s3_dir)
        no_files = download_dir(bucket_name, s3_dir)
        total_files = total_files + no_files
    end_time = datetime.now()
    print('Duration: {}'.format(end_time - start_time))
    print('Total File numbers: %d' % total_files)
    print("ended")

How can I use the nessrest API (Python) to export Nessus scan reports in XML?

I am trying to automate running and downloading Nessus scans using Python. I have been using the nessrest API for Python, and am able to successfully run a scan, but have not been able to successfully download the report in Nessus format.
Any ideas how I can do this? I have been using the scan_download module, but that actually executes before my scan even finishes.
Thanks for the help in advance!
Just looking back at this question, here's an example of using the nessrest API to pull down CSV report exports from your Nessus host:
#!/usr/bin/python2.7
import sys
import os
import io
from nessrest import ness6rest

file_format = 'csv'  # options: nessus, csv, db, html
dbpasswd = ''
scan = ness6rest.Scanner(url="https://nessus:8834", login="admin", password="P#ssword123", insecure=True)
scan.action(action='scans', method='get')
folders = scan.res['folders']
scans = scan.res['scans']

if scan:
    scan.action(action='scans', method='get')
    folders = scan.res['folders']
    scans = scan.res['scans']
    for f in folders:
        if not os.path.exists(f['name']):
            if not f['type'] == 'trash':
                os.mkdir(f['name'])
    for s in scans:
        scan.scan_name = s['name']
        scan.scan_id = s['id']
        folder_name = next(f['name'] for f in folders if f['id'] == s['folder_id'])
        folder_type = next(f['type'] for f in folders if f['id'] == s['folder_id'])
        # skip trash items
        if folder_type == 'trash':
            continue
        if s['status'] == 'completed':
            file_name = '%s_%s.%s' % (scan.scan_name, scan.scan_id, file_format)
            file_name = file_name.replace('\\', '_')
            file_name = file_name.replace('/', '_')
            file_name = file_name.strip()
            relative_path_name = folder_name + '/' + file_name
            # PDF not yet supported
            # python API wrapper nessrest returns the PDF as a string object instead of a byte object,
            # making writing and correctly encoding the file a chore...
            # other formats can be written out in text mode
            file_modes = 'wb'
            # DB is binary mode
            #if args.format == "db":
            #    file_modes = 'wb'
            with io.open(relative_path_name, file_modes) as fp:
                if file_format != "db":
                    fp.write(scan.download_scan(export_format=file_format))
                else:
                    fp.write(scan.download_scan(export_format=file_format, dbpasswd=dbpasswd))
You can see more examples here:
https://github.com/tenable/nessrest/tree/master/scripts
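Since the original question asks for the XML (.nessus) export rather than CSV, presumably the only change needed in the script above is the format variable:
file_format = 'nessus'  # the .nessus export is Nessus' XML report format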

Convert non-strict JSON Amazon SNAP metadata to Pandas DataFrame

I am trying to convert this Amazon sample Snap grocery JSON data to a Pandas dataframe in IBM Bluemix (using Python 2.x) and then analyze it with Apache Spark.
I have unzipped the JSON file and uploaded it to an Apache Spark Container.
Here is my container connection:
# In[ ]:
credentials_1 = {
    'auth_uri': '',
    'global_account_auth_uri': '',
    'username': 'myUname',
    'password': "myPw",
    'auth_url': 'https://identity.open.softlayer.com',
    'project': 'object_storage_988dfce6_5b93_48fc_9575_198bbed3abfc',
    'project_id': '2c05de8a36d74d32bdbe0eeec7e5a372',
    'region': 'dallas',
    'user_id': '4976489bab7d489f8d2eba681adacb78',
    'domain_id': '8b6bc3e989d644858d7b74f24119447a',
    'domain_name': '1079761',
    'filename': 'meta_Grocery_and_Gourmet_Food.json',
    'container': 'grocery',
    'tenantId': 's31d-8e24c13d9c36f4-43b43b7b993d'
}
I then used Apache Spark's sample code for importing data from the container into a StringIO object:
# In[ ]:
import requests, StringIO, pandas as pd, json, re

# In[ ]:
def get_file_content(credentials):
    """For given credentials, this function returns a StringIO object containing the file content."""
    url1 = ''.join([credentials['auth_url'], '/v3/auth/tokens'])
    data = {'auth': {'identity': {'methods': ['password'],
                                  'password': {'user': {'name': credentials['username'],
                                                        'domain': {'id': credentials['domain_id']},
                                                        'password': credentials['password']}}}}}
    headers1 = {'Content-Type': 'application/json'}
    resp1 = requests.post(url=url1, data=json.dumps(data), headers=headers1)
    resp1_body = resp1.json()
    for e1 in resp1_body['token']['catalog']:
        if (e1['type'] == 'object-store'):
            for e2 in e1['endpoints']:
                if (e2['interface'] == 'public' and e2['region'] == credentials['region']):
                    url2 = ''.join([e2['url'], '/', credentials['container'], '/', credentials['filename']])
                    s_subject_token = resp1.headers['x-subject-token']
                    headers2 = {'X-Auth-Token': s_subject_token, 'accept': 'application/json'}
                    resp2 = requests.get(url=url2, headers=headers2)
                    return StringIO.StringIO(resp2.content)
I then converted the String content to a strict JSON pattern by appending [ and ] at the beginning and at the end and by separating the data with a comma.
print('----------------------\n')
import json
myDf = []

def parse(data):
    for l in data:
        yield json.dumps(eval(l))

def getDF(data):
    st = '['
    i = 0
    df = []
    for d in parse(data):
        if i < 100:
            i += 1
            #print(str(d))
            st = st + str(d) + ','
            #print('----------------\n')
    st = st[:-1]
    st = st + ']'
    #js = json.loads(st)
    #print(json.dumps(js))
    return pd.read_json(st)

content_string = get_file_content(credentials_1)
df = getDF(content_string)
df.head()
I am getting a perfectly desirable result.
[Output of the code]
The problem is that when I remove the i < 100 condition, it just never completes and the kernel remains busy for over an hour.
Are there any other, more elegant ways to convert the data into a dataframe?
Also, ijson is not available with Bluemix Notebook.
Let me answer this in two parts.
First, you can install ijson in your Bluemix Spark service using the command below, and then import ijson to use it as needed:
!pip install --user ijson
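As a minimal sketch of what streaming with ijson can look like (assuming the data has already been converted to strict JSON and written to a file; the file name is a placeholder):
import ijson
import pandas as pd

records = []
# stream the elements of a top-level JSON array without loading the whole file at once
with open('grocery_strict.json', 'rb') as f:
    for record in ijson.items(f, 'item'):
        records.append(record)
df = pd.DataFrame(records)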
Second, you can use sqlContext.jsonFile to read the JSON straight from object storage rather than going the long way around to define your schema.
This will even infer the schema for you, and you can then run Spark SQL queries to do whatever you want with the dataframe.
df = sqlContext.jsonFile("swift://" + objectStorageCreds['container'] + "." + objectStorageCreds['name'] + "/" + objectStorageCreds['filename'])
Here is the link to the complete notebook.
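For example, a quick Spark SQL query on the resulting dataframe might look like this (assuming the usual asin/title fields in the SNAP metadata):
# register the dataframe as a temp table and query it with Spark SQL
df.registerTempTable("grocery")
sqlContext.sql("SELECT asin, title FROM grocery LIMIT 10").show()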
If you have to work with a pandas dataframe, you can simply convert it with
df.toPandas().head()
But this will pull everything to the driver node (use carefully).
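If only a sample is needed on the driver, limiting first keeps memory in check, for example:
df.limit(1000).toPandas()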
Thanks,
Charles.