I am trying to build an Akka-based system that will periodically (every 15 seconds) send a REST request, do some filtering, data cleansing, and validation on the received data, and save it into HDFS.
Below is the code that I wrote.
import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import akka.stream.scaladsl.{Flow, Sink, Source}
import akka.http.scaladsl.Http
import akka.http.scaladsl.model.{HttpRequest, HttpResponse, StatusCodes}
import akka.actor.Props
import akka.event.Logging
import akka.actor.Actor
import scala.concurrent.{ExecutionContext, Future}
import scala.concurrent.duration.FiniteDuration
import java.util.concurrent.TimeUnit
import scala.util.Try
import akka.http.scaladsl.client.RequestBuilding._
/**
* Created by rabbanerjee on 4/6/2017.
*/
class MyActor extends Actor {
  val log = Logging(context.system, this)
  import scala.concurrent.ExecutionContext.Implicits.global

  def receive = {
    case j: HttpResponse => log.info("received" + j)
    case k: AnyRef       => log.info("received unknown message" + k)
  }
}
object STest extends App {
  implicit val system = ActorSystem("Sys")
  import system.dispatcher
  implicit val materializer = ActorMaterializer()

  val ss = system.actorOf(Props[MyActor])
  val httpClient = Http().outgoingConnection(host = "rest_server.com", port = 8080)
  val filterSuccess = Flow[HttpResponse].filter(_.status.isSuccess())

  val runnnn = Source.tick(
      FiniteDuration(1, TimeUnit.SECONDS),
      FiniteDuration(15, TimeUnit.SECONDS),
      Get("/"))
    .via(httpClient)
    .via(filterSuccess)
    .to(Sink.actorRef(ss, onCompleteMessage = "done"))

  runnnn.run()
}
The problem I am currently facing is that even though I used a tick source, I see the result only once; it is not firing the request repeatedly.
I am also trying to work out how to group the results of, say, 50 such requests, because I will be writing them to Hadoop and cannot write every single response; that would flood HDFS with many small files.
You are not consuming the responses you are getting back from the HTTP call. Akka HTTP requires you to consume the entity bytes of every response, even if you are not interested in them.
More about this can be found in the docs.
In your example, since you are not using the response entity, you can simply discard its bytes. See the example below:
val runnnn = Source.tick(
    FiniteDuration(1, TimeUnit.SECONDS),
    FiniteDuration(15, TimeUnit.SECONDS),
    Get("/"))
  .via(httpClient)
  .map { resp => resp.discardEntityBytes(); resp }
  .via(filterSuccess)
  .to(Sink.actorRef(ss, onCompleteMessage = "done"))
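Regarding the second part of the question (writing in batches rather than per request), here is a minimal sketch: the built-in grouped operator collects up to 50 elements into a batch, which you could then write to HDFS as one file. The println stands in for a hypothetical HDFS writer, and in practice you would parse each response entity instead of discarding it.
val batched = Source.tick(
    FiniteDuration(1, TimeUnit.SECONDS),
    FiniteDuration(15, TimeUnit.SECONDS),
    Get("/"))
  .via(httpClient)
  .map { resp => resp.discardEntityBytes(); resp }   // parse the entity here instead if you need the payload
  .via(filterSuccess)
  .grouped(50)                                       // emits batches (Seq[HttpResponse]) of up to 50 elements
  .to(Sink.foreach[Seq[HttpResponse]] { batch =>
    // replace this println with your actual HDFS write, one file per batch
    println(s"writing a batch of ${batch.size} responses to HDFS")
  })

batched.run()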
I am trying to display all of the SharePoint site's list names, but I am getting this error:
No handlers could be found for logger "office365.runtime.auth.saml_token_provider.SamlTokenProvider._process_service_token_response"
This is my code:
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext

url = 'https://abc.sharepoint.com/sites/siteName/'
ctx_auth = AuthenticationContext(url)
if ctx_auth.acquire_token_for_user(username='username@abc.com',
                                   password='password'):
    ctx = ClientContext(url, ctx_auth)
    lists = ctx.web.lists
    ctx.load(lists)
    ctx.execute_query()
    for l in lists:
        print(l.properties["Title"])
Thanks
I tested the code below with Python 2.7 and it works well.
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext

tenant_url = "https://company.sharepoint.com"
site_url = "https://company.sharepoint.com/sites/sname"
ctx_auth = AuthenticationContext(tenant_url)
if ctx_auth.acquire_token_for_user("abc@company.onmicrosoft.com", "mypassword"):
    ctx = ClientContext(site_url, ctx_auth)
    lists = ctx.web.lists
    ctx.load(lists)
    ctx.execute_query()
    for l in lists:
        print(l.properties["Title"])
else:
    print(ctx_auth.get_last_error())
If this is related to ADFS, please refer to this closed question:
https://github.com/vgrem/Office365-REST-Python-Client/issues/85
BR
Well, I found a solution to get the data of a specific SharePoint list:
from shareplum import Site
from shareplum import Office365
import json
import csv
import pandas
authcookie = Office365('https://abc.sharepoint.com/', username='username', password='password').GetCookies()
site = Site('https://abc.sharepoint.com/sites/SitesName/', authcookie=authcookie)
sp_list = site.List('ListName')
#print(sp_list)
data = sp_list.GetListItems(fields=['FieldName1','FieldName2'])
c = pandas.read_json(json.dumps(data)).to_csv("output.csv")
I have a Raspberry Pi on Wi-Fi running a people-counting model; it sends the processed images to my server over a ZeroMQ socket. On that server I built a web server using Flask. Streaming works for the first person who accesses the website, but every subsequent access fails with this error:
zmq.error.ZMQError: Address in use
What should I do so that more people can access my server and see the video stream?
The server code:
from flask import Flask, render_template, Response, request, jsonify
from flask_restful import Api
import numpy as np
from api.configs import configs
from flask_cors import CORS
import zmq
import cv2
import base64

app = Flask(__name__, static_url_path='/static')
api_restful = Api(app)
cors = CORS(app)

def gen():
    context = zmq.Context()
    footage_socket = context.socket(zmq.SUB)
    footage_socket.bind('tcp://*:{}'.format(configs.SOCKET_PORT))
    footage_socket.setsockopt_string(zmq.SUBSCRIBE, np.unicode(''))
    while True:
        frame = footage_socket.recv()
        npimg = np.fromstring(frame, dtype=np.uint8)
        source = cv2.imdecode(npimg, 1)
        s = cv2.imencode('.jpg', source)[1].tobytes()
        yield (b'--frame\r\n'
               b'Content-Type: image/jpeg\r\n\r\n' + s + b'\r\n')

@app.route('/video_feed')
def video_feed():
    return Response(gen(),
                    mimetype='multipart/x-mixed-replace; boundary=frame')

if __name__ == '__main__':
    app.run(debug=True,
            threaded=True,
            port=configs.PORT,
            host=configs.HOST)
The Raspberry Pi client code:
import socket
import time
import logging as log
import numpy as np
from api.configs import configs
import zmq
import cv2

context = zmq.Context()
footage_socket = context.socket(zmq.PUB)
footage_socket.connect('tcp://{}:{}'.format(configs.SERVER_HOST,
                                            configs.SOCKET_PORT))
time.sleep(1)

cap = cv2.VideoCapture(0)
while True:
    while cap.isOpened():
        ret, frame = cap.read()
        frame = cv2.resize(frame, (640, 480))
        encoded, buffer_ = cv2.imencode('.jpg', frame)
        footage_socket.send(buffer_)
I omitted some code to make it easier to read.
I cannot speak to the details hidden inside the Flask() web-served video re-wrapping/delivery, yet the ZeroMQ part seems to be the trouble, so:
a) it seems that every web-watcher spawns another @app.route('/video_feed')-decorated Flask() handler (check the mode - thread-based or process-based - as it determines whether a central Context() instance can or cannot be shared and efficiently re-used).
b) the code residing inside the hooked Response()-feeder attempts to .bind() for each subsequently served user, which for obvious reasons collides: the resource (the address:port) has already been POSACK'ed for exclusive use by the first-come, first-served visitor, so any later one must crash, and it crashes exactly as documented above.
PROFESSIONAL SOLUTION (or just a quick-but-dirty fix?):
For small-scale use cases it is indeed enough to reverse the .bind()/.connect() calls, i.e. the PUB will .bind() (it does not replicate itself and can serve any small number of further SUBs, which arrive via .connect()).
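A minimal sketch of that reversal, assuming the Flask server can reach the Pi at a known address (PI_ADDRESS and port 5555 below are placeholders, not values from the question); only the socket setup changes:
# --- Raspberry Pi side: the PUB now binds, once ---
import zmq

context = zmq.Context()
footage_socket = context.socket(zmq.PUB)
footage_socket.bind('tcp://*:5555')                  # was .connect(...) in the question

# --- Flask server side, inside gen(): each SUB now connects ---
import zmq

context = zmq.Context()
footage_socket = context.socket(zmq.SUB)
footage_socket.setsockopt_string(zmq.SUBSCRIBE, '')
footage_socket.connect('tcp://PI_ADDRESS:5555')      # was .bind(...) in the question; connects never collide
With this layout every viewer gets its own SUB, at the cost of the Pi re-sending each frame to every subscriber, which is exactly the weakness warned about next.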
However, this quick fix is a bit dirty and should never be done in a production-grade system: the workload envelope will soon grow above reasonable levels. A proper architecture should work differently (a condensed sketch follows the list of steps below):
instantiate a central zmq.Context( configs.nIOthreads )-instance (which can be shared and re-used in the called methods), so that the scaling of nIOthreads is also centralised for performance reasons
instantiate one central SUB-instance that communicates with the RPi and collects the video frames, to avoid any duplication of image transfers
configure both sides to use .setsockopt( zmq.CONFLATE ), to principally avoid re-broadcasting a video "history of no value"
instantiate all the @app-decorated Flask() tools so that they re-use the central Context()-instance and internally connect to a central video re-PUB-lisher, via the SUB side of another PUB/SUB archetype, here over the efficient memory-mapped inproc:// transport, again using .setsockopt( zmq.CONFLATE )
configure all resources for higher robustness - at least with an explicit .setsockopt( zmq.LINGER, 0 )
detect and handle all error states; the ZeroMQ API documents them well enough to help you diagnose and manage any failed state professionally and safely
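A condensed, hedged sketch of that layout on the Flask side (the port number, io-thread count and background-thread wiring are assumptions, not taken from the question; the Pi keeps PUB-lishing already-JPEG-encoded frames as before, and the existing /video_feed route keeps calling gen() unchanged):
# Sketch only: one central SUB collects frames from the Pi, a central PUB
# re-publishes them over inproc://, and every Flask generator attaches its
# own lightweight inproc SUB. CONFLATE keeps only the newest frame.
import threading
import zmq

context = zmq.Context(io_threads=2)          # one shared Context for the whole process

def frame_pump():
    sub = context.socket(zmq.SUB)
    sub.setsockopt(zmq.CONFLATE, 1)          # keep only the latest frame
    sub.setsockopt(zmq.LINGER, 0)
    sub.setsockopt_string(zmq.SUBSCRIBE, '')
    sub.bind('tcp://*:5555')                 # the single tcp .bind() for the Pi's PUB

    repub = context.socket(zmq.PUB)
    repub.setsockopt(zmq.LINGER, 0)
    repub.bind('inproc://frames')            # in-process fan-out point

    while True:
        repub.send(sub.recv())               # forward the newest frame

threading.Thread(target=frame_pump, daemon=True).start()

def gen():
    # Each web viewer gets its own inproc SUB; no second tcp .bind() ever happens.
    sub = context.socket(zmq.SUB)
    sub.setsockopt(zmq.CONFLATE, 1)
    sub.setsockopt(zmq.LINGER, 0)
    sub.setsockopt_string(zmq.SUBSCRIBE, '')
    sub.connect('inproc://frames')
    while True:
        frame = sub.recv()                   # already JPEG-encoded by the Pi
        yield (b'--frame\r\n'
               b'Content-Type: image/jpeg\r\n\r\n' + frame + b'\r\n')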
Does emr-dynamodb-connector read data in parallel in Spark? I checked, and the RDD I get from it has only one partition.
import org.apache.hadoop.io.Text
import org.apache.hadoop.dynamodb.DynamoDBItemWritable
import org.apache.hadoop.dynamodb.read.DynamoDBInputFormat
import org.apache.hadoop.dynamodb.write.DynamoDBOutputFormat
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.io.LongWritable
var jobConf = new JobConf(sc.hadoopConfiguration)
jobConf.set("dynamodb.input.tableName", "TableName")
jobConf.set("mapred.output.format.class", "org.apache.hadoop.dynamodb.write.DynamoDBOutputFormat")
jobConf.set("mapred.input.format.class", "org.apache.hadoop.dynamodb.read.DynamoDBInputFormat")
var orders = sc.hadoopRDD(jobConf, classOf[DynamoDBInputFormat], classOf[Text], classOf[DynamoDBItemWritable])
The code above reads data from DynamoDB. The following is the number of partitions:
scala> orders.getNumPartitions
res4: Int = 1
Is there any way to read data in parallel and process it?
I am building an Akka Streams application that goes through a couple of steps. One particular step produces zero or more results; it is not known in advance how many there will be. Each result has to be processed asynchronously (by the same kind of component), and finally all the results have to be merged.
How should I model this in Akka Streams? I noticed that the GraphDSL has a Broadcast element that lets you model a fan-out, but this seems to be possible only when the number of outlets is known in advance.
Is there a way in Akka Streams to have something like Broadcast but that fans out to a dynamic number of outlets?
Check out Hubs on this page: https://doc.akka.io/docs/akka/current/stream/stream-dynamic.html?language=scala
There are many cases when consumers or producers of a certain service (represented as a Sink, Source, or possibly Flow) are dynamic and not known in advance. The Graph DSL cannot represent this: all connections of the graph must be known and connected upfront. To allow dynamic fan-in and fan-out streaming, the Hubs should be used.
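For completeness, here is a minimal sketch of the Hub approach; the object and value names are illustrative, not taken from the question. MergeHub materializes a Sink that any number of producers can attach to at runtime, and BroadcastHub materializes a Source that any number of consumers can attach to.
// Sketch only: dynamic fan-in via MergeHub, dynamic fan-out via BroadcastHub.
import akka.NotUsed
import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import akka.stream.scaladsl.{BroadcastHub, Keep, MergeHub, Sink, Source}

object HubSketch extends App {
  implicit val system = ActorSystem("HubSketch")
  implicit val materializer = ActorMaterializer()

  // Materialize once: a Sink that producers attach to and a Source that consumers attach to.
  val (dynamicSink, dynamicSource): (Sink[String, NotUsed], Source[String, NotUsed]) =
    MergeHub.source[String](perProducerBufferSize = 16)
      .toMat(BroadcastHub.sink[String](bufferSize = 256))(Keep.both)
      .run()

  // Any number of producers can be connected at runtime...
  Source(List("a", "b", "c")).runWith(dynamicSink)
  Source(List("d", "e")).runWith(dynamicSink)

  // ...and any number of consumers as well.
  dynamicSource.runForeach(s => println(s"consumer 1: $s"))
  dynamicSource.runForeach(s => println(s"consumer 2: $s"))
}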
It turns out that mapConcat does what I want. Here's a POC:
package streams

import scala.concurrent._
import akka._
import akka.actor._
import akka.stream._
import akka.stream.scaladsl._
import scala.util.Random

object StreamsTest extends App {
  implicit val system = ActorSystem("TestSystem")
  implicit val materializer = ActorMaterializer()
  import system.dispatcher

  case class SplitRequest(s: String)

  def requestHandlerWithMultipleResults(request: SplitRequest): List[String] =
    request.s.split(" ").toList

  def slowProcessingTask(s: String) = {
    Thread.sleep(Random.nextInt(5000))
    s.toUpperCase
  }

  val g = RunnableGraph.fromGraph(GraphDSL.create() { implicit builder: GraphDSL.Builder[NotUsed] =>
    import GraphDSL.Implicits._

    val source: Source[String, NotUsed] = Source(List(SplitRequest("january february march april may")))
      .mapConcat(requestHandlerWithMultipleResults)
      .mapAsyncUnordered(5)(s => Future(slowProcessingTask(s)))

    val sink = Sink.foreach(println)

    source ~> sink

    ClosedShape
  })

  g.run()
}
Output, e.g.:
MAY
JANUARY
FEBRUARY
MARCH
APRIL
I need some help regarding an error in my code. My code retrieves Zomato reviews, stores them in HDFS, then reads them back and performs recommender analytics on them. The problem is that one of my functions is not recognized in the PySpark code. I am not pasting the entire code, as it might be confusing, so I am writing a small, similar use case for easier understanding.
I am trying to read a file from the local filesystem, convert the RDD to a DataFrame, perform some operations, convert it back to an RDD, apply a map operation that joins the fields with a '|' delimiter, and then save it to HDFS.
When I try to call self.filter_data(y) in the lambda function of the check function, it is not recognized and gives me this error:
Exception: It appears that you are attempting to reference
SparkContext from a broadcast variable, action, or transformation.
SparkContext can only be used on the driver, not in code that it run
on workers. For more information, see SPARK-5063.
Can anyone help me understand why my filter_data function is not recognized? Do I need to add anything, or is there something wrong with the way I am calling it? Please help. Thanks in advance.
INPUT VALUE
starting
0|0|ffae4f|0|https://b.zmtcdn.com/data/user_profile_pictures/565/aed32fa2eb18bb4a5a3ba426870fd565.jpg?fit=around%7C100%3A100&crop=100%3A100%3B%2A%2C%2A|https://www.zomato.com/akellaram87?utm_source=api_basic_user&utm_medium=api&utm_campaign=v2.1|2.5|FFBA00|Well...|unknown|16946626|2017-08-01T00-25-43.455182Z|30059877|Have been here for a quick bite for lunch, ambience and everything looked good, food was okay but presentation was not very appealing. We or...|2017-04-15 16:38:38|Big Foodie|6|Venkata Ram Akella|akellaram87|Bad Food|0.969352505662|0|0|0|0|0|0|1|1|0|0|1|0|0|0.782388212399
ending
starting
1|0|ffae4f|0|https://b.zmtcdn.com/data/user_profile_pictures/4d1/d70d7a57e1bfdf296ff4db3d8daf94d1.jpg?fit=around%7C100%3A100&crop=100%3A100%3B%2A%2C%2A|https://www.zomato.com/users/sm4-2011696?utm_source=api_basic_user&utm_medium=api&utm_campaign=v2.1|1|CB202D|Avoid!|unknown|16946626|2017-08-01T00-25-43.455182Z|29123338|Giving a 1.0 rating because one cannot proceed with writing a review, without rating it. This restaurant deserves a 0 star rating. The qual...|2017-01-04 10:54:53|Big Foodie|4|Sm4|unknown|Bad Service|0.964402034541|0|1|0|0|0|0|0|1|0|0|0|1|0|0.814540622345
ending
My code:
class new:
    def __init__(self):
        print 'entered into init'

    def check(self):
        data = sc.textFile('file:///bdaas/src/spark_dependencies/classifier_data/final_Output.txt') \
                 .map(lambda x: x.split('|')) \
                 .map(lambda z: Row(restaurant_id=z[0], rating=z[1], review_id=z[2], review_text=z[3], rating_color=z[4], rating_time_friendly=z[5], rating_text=z[6], time_stamp=z[7], likes=z[8], comment_count=z[9], user_name=z[10], user_zomatohandle=z[11], user_foodie_level=z[12], user_level_num=z[13], foodie_color=z[14], profile_url=z[15], profile_image=z[16], retrieved_time=z[17]))
        data_r = sqlContext.createDataFrame(data)
        data_r.show()
        d = data_r.rdd.collect()
        print d
        data_r.rdd.map(lambda x: list(x)).map(lambda y: self.filter_data(y)).collect()
        print data_r

    def filter_data(self, y):
        s = str()
        for i in y:
            print i.encode('utf-8')
            if i != '':
                s = s + i.encode('utf-8') + '|'
        print s[0:-1]
        return s[0:-1]

if __name__ == '__main__':
    import os, logging, sys, time, pandas, json
    from subprocess import PIPE, Popen, call
    from datetime import datetime, time, timedelta
    from pyspark import SparkContext, SparkConf

    conf = SparkConf().setAppName('test')
    sc = SparkContext(conf=conf, pyFiles=['/bdaas/exe/nlu_project/spark_classifier.py', '/bdaas/exe/spark_zomato/other_files/spark_zipcode.py', '/bdaas/exe/spark_zomato/other_files/spark_zomato.py', '/bdaas/exe/spark_zomato/conf_files/spark_conf.py', '/bdaas/exe/spark_zomato/conf_files/date_comparision.py'])

    from pyspark.sql import Row, SQLContext, HiveContext
    from pyspark.sql.functions import lit
    sqlContext = HiveContext(sc)

    import sys, logging, pandas as pd
    import spark_conf

    n = new()
    n.check()