Related
I am trying to run a query in BigQuery right after a dataflow job completes successfully. I have defined 3 different functions in main.py.
The first one is for running the dataflow job. The second one checks the dataflow jobs status. And the last one runs the query in BigQuery.
The trouble is the second function checks the dataflow job status multiple times for a period of time and after the dataflow job completes successfully, it does not stop checking the status.
And then function deployment fails due to 'function load attempt timed out' error.
from googleapiclient.discovery import build
from oauth2client.client import GoogleCredentials
import os
import re
import config
from google.cloud import bigquery
import time
global flag
def trigger_job(gcs_path, body):
credentials = GoogleCredentials.get_application_default()
service = build('dataflow', 'v1b3', credentials=credentials, cache_discovery=False)
request = service.projects().templates().launch(projectId=config.project_id, gcsPath=gcs_path, body=body)
response = request.execute()
def get_job_status(location, flag):
credentials=GoogleCredentials.get_application_default()
dataflow=build('dataflow', 'v1b3', credentials=credentials, cache_discovery=False)
result=dataflow.projects().jobs().list(projectId=config.project_id, location=location).execute()
for job in result['jobs']:
if re.findall(r'' + re.escape(config.job_name) + '', job['name']):
while flag==0:
if job['currentState'] != "JOB_STATE_DONE":
print('NOT DONE')
else:
flag=1
print('DONE')
break
def bq(sql):
client = bigquery.Client()
query_job = client.query(sql, location='US')
gcs_path = config.gcs_path
body=config.body
trigger_job(gcs_path,body)
flag=0
location='us-central1'
get_job_status(location,flag)
sql= """CREATE OR REPLACE TABLE 'table' AS SELECT * FROM 'table'"""
bq(SQL)
Cloud Function timeout is set to 540 seconds but deployment fails in 3-4 minutes.
Any help is very appreciated.
It appears from the code snippet provided that your HTTP-triggered cloud function is not returning a HTTP response.
All HTTP-based cloud functions must return a HTTP response for proper termination. From the google documentation Ensure HTTP functions send an HTTP response (Emphasis - mine):
If your function is HTTP-triggered, remember to send an HTTP response,
as shown below. Failing to do so can result in your function executing
until timeout. If this occurs, you will be charged for the entire
timeout time. Timeouts may also cause unpredictable behavior or cold
starts on subsequent invocations, resulting in unpredictable behavior
or additional latency.
Thus, you must have a function that in your main.py that returns some sort of value, ideally a value that can be coerced into a Flask http response.
We started using Dataflow to read from PubSub and Stream to BigQuery.
Dataflow should work 24/7, because pubsub is constantly updated with analytics data of multiple websites around the world.
Code looks like this:
from __future__ import absolute_import
import argparse
import json
import logging
import apache_beam as beam
from apache_beam.io import ReadFromPubSub, WriteToBigQuery
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
logger = logging.getLogger()
TABLE_IDS = {
'table_1': 0,
'table_2': 1,
'table_3': 2,
'table_4': 3,
'table_5': 4,
'table_6': 5,
'table_7': 6,
'table_8': 7,
'table_9': 8,
'table_10': 9,
'table_11': 10,
'table_12': 11,
'table_13': 12
}
def separate_by_table(element, num):
return TABLE_IDS[element.get('meta_type')]
class ExtractingDoFn(beam.DoFn):
def process(self, element):
yield json.loads(element)
def run(argv=None):
"""Main entry point; defines and runs the wordcount pipeline."""
logger.info('STARTED!')
parser = argparse.ArgumentParser()
parser.add_argument('--topic',
dest='topic',
default='projects/PROJECT_NAME/topics/TOPICNAME',
help='Gloud topic in form "projects/<project>/topics/<topic>"')
parser.add_argument('--table',
dest='table',
default='PROJECTNAME:DATASET_NAME.event_%s',
help='Gloud topic in form "PROJECT:DATASET.TABLE"')
known_args, pipeline_args = parser.parse_known_args(argv)
# We use the save_main_session option because one or more DoFn's in this
# workflow rely on global context (e.g., a module imported at module level).
pipeline_options = PipelineOptions(pipeline_args)
pipeline_options.view_as(SetupOptions).save_main_session = True
p = beam.Pipeline(options=pipeline_options)
lines = p | ReadFromPubSub(known_args.topic)
datas = lines | beam.ParDo(ExtractingDoFn())
by_table = datas | beam.Partition(separate_by_table, 13)
# Create a stream for each table
for table, id in TABLE_IDS.items():
by_table[id] | 'write to %s' % table >> WriteToBigQuery(known_args.table % table)
result = p.run()
result.wait_until_finish()
if __name__ == '__main__':
logger.setLevel(logging.INFO)
run()
It works fine but after some time (2-3 days) it stops streaming for some reason.
When I check job status, it contains no errors in the logs section (you know, ones marked with red "!" in dataflow's job details). If I cancel the job and run it again - it starts working again, as usual.
If I check Stackdriver for additional logs, here's all Errors that happened:
Here's some warnings that occur periodically while job executes:
Details of one of them:
{
insertId: "397122810208336921:865794:0:479132535"
jsonPayload: {
exception: "java.lang.IllegalStateException: Cannot be called on unstarted operation.
at com.google.cloud.dataflow.worker.fn.data.RemoteGrpcPortWriteOperation.getElementsSent(RemoteGrpcPortWriteOperation.java:111)
at com.google.cloud.dataflow.worker.fn.control.BeamFnMapTaskExecutor$SingularProcessBundleProgressTracker.updateProgress(BeamFnMapTaskExecutor.java:293)
at com.google.cloud.dataflow.worker.fn.control.BeamFnMapTaskExecutor$SingularProcessBundleProgressTracker.periodicProgressUpdate(BeamFnMapTaskExecutor.java:280)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
"
job: "2018-11-30_10_35_19-13557985235326353911"
logger: "com.google.cloud.dataflow.worker.fn.control.BeamFnMapTaskExecutor"
message: "Progress updating failed 4 times. Following exception safely handled."
stage: "S0"
thread: "62"
work: "c-8756541438010208464"
worker: "beamapp-vitar-1130183512--11301035-mdna-harness-lft7"
}
labels: {
compute.googleapis.com/resource_id: "397122810208336921"
compute.googleapis.com/resource_name: "beamapp-vitar-1130183512--11301035-mdna-harness-lft7"
compute.googleapis.com/resource_type: "instance"
dataflow.googleapis.com/job_id: "2018-11-30_10_35_19-13557985235326353911"
dataflow.googleapis.com/job_name: "beamapp-vitar-1130183512-742054"
dataflow.googleapis.com/region: "europe-west1"
}
logName: "projects/PROJECTNAME/logs/dataflow.googleapis.com%2Fharness"
receiveTimestamp: "2018-12-03T20:33:00.444208704Z"
resource: {
labels: {
job_id: "2018-11-30_10_35_19-13557985235326353911"
job_name: "beamapp-vitar-1130183512-742054"
project_id: PROJECTNAME
region: "europe-west1"
step_id: ""
}
type: "dataflow_step"
}
severity: "WARNING"
timestamp: "2018-12-03T20:32:59.442Z"
}
Here's the moment when it seems to start having problems:
Additional info messages that may help:
According to these messages, we don't run out of memory/processing power etc. The job is run with these parameters:
python -m start --streaming True --runner DataflowRunner --project PROJECTNAME --temp_location gs://BUCKETNAME/tmp/ --region europe-west1 --disk_size_gb 30 --machine_type n1-standard-1 --use_public_ips false --num_workers 1 --max_num_workers 1 --autoscaling_algorithm NONE
What could be the problem here?
This isn't really an answer, more helping identify the cause: so far, all streaming Dataflow jobs I've launched using python SDK have stopped that way after some days, whether they use BigQuery as sink or not. So the cause rather seems to be the general fact that streaming jobs with the python SDK are still in beta.
My personal solution: use the Dataflow templates to stream from Pub/Sub to BigQuery (thus avoiding the python SDK), then schedule queries in BigQuery to periodically treat the data. Unfortunately that might not be appropriate for your use cases.
in my company we are experiencing the same and identical problem, as described by the OP, with a similar use case.
Unfortunately the problem is real, concrete and apparently with a random occurrence.
As a workaround, we are considering rewriting our pipeline using the java SDK.
I had a similar issue to this and found that the warning logs contained python Stack trace hidden in the java logs advising of errors.
These errors were continually re-tried by workers causing them to crash and completely freeze the pipeline. I initially thought the No. of workers was too low, so scaled up the number of workers, but the pipeline just took longer to freeze.
I ran the pipeline locally and exported the pubsub messages as text and identified they contained dirty data(messages that did not match the BQ table schema) and as I had no exception handling, that seemed to be the cause of the pipeline to freeze.
Adding a function only accept a record where the first key matches the expected column of your BQ Schema fixed my issue and the Dataflow Job has been running with no issues ongoing.
def bad_records(row):
if 'key1' in row:
yield row
else:
print('bad row',row)
|'exclude bad records' >> beam.ParDo(bad_records)
I am using AWS StepFunctions to carry out several tasks on the Google Cloud side - creating a Dataproc cluster, submitting a task to it, and then tearing it down (each of which have their own Task state, as well as "poller" tasks that check when the jobs have been finished in order to move onto the next Task).
The issue is, for tearing down the cluster, the Task goes into the "cancelled" (gray) status instead of "in progress", followed by the poller Task. Once the cluster deletion lambda function executes the cluster deletion method, it should move on to the poller Task.
Here is a look at the cluster deletion lambda function:
from pprint import pprint
from google.cloud import storage
import googleapiclient.discovery
from rkstr8.cloud.google import GoogleCloudLambdaAuth
import time
def handler(event, context):
creds = event['GCP_creds']
GoogleCloudLambdaAuth(creds).configure_google_creds()
dataproc = googleapiclient.discovery.build('dataproc', 'v1')
project_id = event['gcp-administrative']['project']
zone = event['gcp-administrative']['zone']
try:
region_as_list = zone.split('-')[:-1]
region = '-'.join(region_as_list)
except (AttributeError, IndexError, ValueError):
raise ValueError('Invalid zone provided, please check your input.')
cluster = event['dataproc-administrative']['cluster_name']
print('Tearing down cluster...')
request = dataproc.projects().regions().clusters().delete(
projectId=project_id,
region=region,
clusterName=cluster)
time.sleep(30)
result = request.execute()
return result
Here is what the relevant part of the state machine building code looks like:
dproc_submit_state = AsyncPoller(
stats_path=DPROC_SUBMIT_POLLER_STATUS_PATH,
async_task=Task(
name=DPROC_SUBMIT,
resource=DPROC_SUBMIT_ARN_VAR,
input_path=DPROC_SUBMIT_INPUT_PATH,
result_path=DPROC_SUBMIT_RESULT_PATH,
next=DPROC_SUBMIT_POLLER
),
pollr_task=Task(
name=DPROC_SUBMIT_POLLER,
resource=DPROC_SUBMIT_POLLER_ARN_VAR,
input_path=DPROC_SUBMIT_RESULT_PATH,
result_path=DPROC_SUBMIT_POLLER_STATUS_PATH
),
faild_task=Fail(
name='HailScriptFailed'
),
succd_task=DPROC_DELETE,
pollr_wait_time=self.conf["POLLER_WAIT_TIME"]
).states()
dproc_delete_state = AsyncPoller(
stats_path=DPROC_DELETE_POLLER_STATUS_PATH,
async_task=Task(
name=DPROC_DELETE,
resource=DPROC_DELETE_ARN_VAR,
input_path=DPROC_DELETE_INPUT_PATH,
result_path=DPROC_DELETE_RESULT_PATH,
next=DPROC_DELETE_POLLER
),
pollr_task=Task(
name=DPROC_DELETE_POLLER,
resource=DPROC_DELETE_POLLER_ARN_VAR,
input_path=DPROC_DELETE_RESULT_PATH,
result_path=DPROC_DELETE_POLLER_STATUS_PATH
),
faild_task=Fail(
name='ClusterDeleteFailed'
),
succd_task='PipelineSucceeded',
pollr_wait_time=self.conf["POLLER_WAIT_TIME"]
).states()
Here is what the state machine looks like:
Why are you sleeping for 30 seconds between creating a request and executing it?
The default timeout for lambda is 3 seconds. My guess is that your lambda is just timing out.
My python code looks like :
import json
import boto.sqs
import boto
from boto.sqs.connection import SQSConnection
from boto.sqs.message import Message
from boto.sqs.message import RawMessage
sqs = boto.connect_sqs(aws_access_key_id='XXXXXXXXXXXXXXX',aws_secret_access_key='XXXXXXXXXXXXXXXXX')
q = sqs.create_queue("Nishantqueue") // Already present
q.set_message_class(RawMessage)
results = q.get_messages()
ret = "Got %s result(s) this time.\n\n" % len(results)
for result in results:
msg = json.loads(result.get_body())
ret += "Message: %s\n" % msg['message']
ret += "\n... done."
print ret
My SQS queue contains atleast 5 to 6 messages... when i execute this ... i get output as this and this is on every run, this code isn't able to pull the mssgs from the queue :
Got 0 result(s) this time.
...done.
I am sure i am missing something in the loop.... couldn't find though
Your code is retrieving messages from an Amazon SQS queue, but it doesn't seem to be deleting them. This means that messages will be invisible for a period of time (specified by the visibility_timeout parameter), after which they will reappear. The expectation is that if a message is not deleted within this time, then it has failed to be processed and should reappear on the queue to try again.
Here's some code that pulls a message from a queue, then deletes it after processing. Note the visibility_timeout specified when a message is retrieved. It is using read() to simply return one message:
#!/usr/bin/python27
import boto, boto.sqs
from boto.sqs.message import Message
# Connect to Queue
q_conn = boto.sqs.connect_to_region("ap-southeast-2")
q = q_conn.get_queue('queue-name')
# Get a message
m = q.read(visibility_timeout=15)
if m == None:
print "No message!"
else:
print m.get_body()
q.delete_message(m)
It's possible that your messages were invisible ("in-flight") when you tried to retrieve them.
I am using Python 2.7 and Jenkins.
I am writing some code in Python that will perform a checkin and wait/poll for Jenkins job to be complete. I would like some thoughts on around how I achieve it.
Python function to create a check-in in Perforce-> This can be easily done as P4 has CLI
Python code to detect when a build got triggered -> I have the changelist and the job number. How do I poll the Jenkins API for the build log to check if it has the appropriate changelists? The output of this step is a build url which is carrying out the job
How do I wait till the Jenkins job is complete?
Can I use snippets from the Jenkins Rest API or from Python Jenkins module?
If you need to know if the job is finished, the buildNumber and buildTimestamp are not enough.
This is the gist of how I find out if a job is complete, I have it in ruby but not python so perhaps someone could update this into real code.
lastBuild = get jenkins/job/myJob/lastBuild/buildNumber
get jenkins/job/myJob/lastBuild/build?token=gogogo
currentBuild = get jenkins/job/myJob/lastBuild/buildNumber
while currentBuild == lastBuild
sleep 1
thisBuild = get jenkins/job/myJob/lastBuild/buildNumber
buildInfo = get jenkins/job/myJob/[thisBuild]/api/xml?depth=0
while buildInfo["freeStyleBuild/building"] == true
buildInfo = get jenkins/job/myJob/[thisBuild]/api/xml?depth=0
sleep 1
ie. I found I needed to A) wait until the build starts (new build number) and B) wait until the building finishes (building is false).
You can query the last build timestamp to determine if the build finished. Compare it to what it was just before you triggered the build, and see when it changes. To get the timestamp, add /lastBuild/buildTimestamp to your job URL
As a matter of fact, in your Jenkins, add /lastBuild/api/ to any Job, and you will see a lot of API information. It even has Python API, but I not familiar with that so can't help you further
However, if you were using XML, you can add lastBuild/api/xml?depth=0 and inside the XML, you can see the <changeSet> object with list of revisions/commit messages that triggered the build
Simple solution using invoke and block_until_complete methods (tested with Python 3.7)
import jenkinsapi
from jenkinsapi.jenkins import Jenkins
...
server = Jenkins(jenkinsUrl, username=jenkinsUser,
password=jenkinsToken, ssl_verify=sslVerifyFlag)
job = server.create_job(jobName, None)
queue = job.invoke()
queue.block_until_complete()
Inpsired by a test method in pycontribs
This snippet starts build job and wait until job is done.
It is easy to start the job but we need some kind of logic to know when job is done. First we need to wait for job ID to be applied and than we can query job for details:
from jenkinsapi import jenkins
server = jenkins.Jenkins(jenkinsurl, username=username, password='******')
job = server.get_job(j_name)
prev_id = job.get_last_buildnumber()
server.build_job(j_name)
while True:
print('Waiting for build to start...')
if prev_id != job.get_last_buildnumber():
break
time.sleep(3)
print('Running...')
last_build = job.get_last_build()
while last_build.is_running():
time.sleep(1)
print(str(last_build.get_status()))
Don't know if this was available at the time of the question, but jenkinsapi module's Job.invoke() and/or Jenkins.build_job() return a QueueItem object, which can block_until_building(), or block_until_complete()
jobq = server.build_job(job_name, job_params)
jobq.block_until_building()
print("Job %s (%s) is building." % (jobq.get_job_name(), jobq.get_build_number()))
jobq.block_until_complete(5) # check every 5s instead of the default 15
print("Job complete, %s" % jobq.get_build().get_status())
Was going through the same problem and this worked for me, using python3 and python-jenkins.
while "".join([d['color'] for d in j.get_jobs() if d['name'] == "job_name"]) == 'blue_anime':
print('Job is Running')
time.sleep(1)
print('Job Over!!')
Working Github Script: Link
This is working for me
#!/usr/bin/env python
import jenkins
import time
server = jenkins.Jenkins('https://jenkinsurl/', username='xxxxx', password='xxxxxx')
j_name = 'test'
server.build_job(j_name, {'testparam1': 'test', 'testparam2': 'test'})
while True:
print('Running....')
if server.get_job_info(j_name)['lastCompletedBuild']['number'] == server.get_job_info(j_name)['lastBuild']['number']:
print "Last ID %s, Current ID %s" % (server.get_job_info(j_name)['lastCompletedBuild']['number'], server.get_job_info(j_name)['lastBuild']['number'])
break
time.sleep(3)
print('Stop....')
console_output = server.get_build_console_output(j_name, server.get_job_info(j_name)['lastBuild']['number'])
print console_output
the issue main issue that the build_job doesn't return the number of the job, returns the number of a queue item (that only last 5 min). so the trick is
build_job
get the queue number,
with the queue number get the job_number
now we know the name of the job and the job number
get_job_info and loop the jobs till we find one with our job number
check the status
so i made a function for it with time_out
import time
from datetime import datetime, timedelta
import jenkins
def launch_job(jenkins_connection, job_name, parameters={}, wait=False, interval=30, time_out=7200):
"""
Create a jenkins job and waits for the job to finish
:param jenkins_connection: jenkins server jenkins object
:param job_name: the name of job we want to create and see if finish string
:param parameters: the parameters of the job to build directory
:param wait: if we want to wait for the job to finish or not bool
:param interval: how often we want to monitor seconds int
:param time_out: break the loop after certain X seconds int
:return: build job number int
"""
# we lunch the job and returns a queue_id
job_id = jenkins_connection.build_job(job_name, parameters)
# from the queue_id we get the job number that was created
queue_job = jenkins_connection.get_queue_item(job_id, depth=0)
build_number = queue_job["executable"]["number"]
print(f"job_name: {job_name} build_number: {build_number}")
if wait is True:
now = datetime.now()
later = now + timedelta(seconds=time_out)
while True:
# we check current time vs the timeout(later)
if datetime.now() > later:
raise ValueError(f"Job: {job_name}:{build_number} is running for more than {time_out} we"
f"stop monitoring the job, you can check it in Jenkins")
b = jenkins_connection.get_job_info(job_name, depth=1, fetch_all_builds=False)
for i in b["builds"]:
loop_id = i["id"]
if int(loop_id) == build_number:
result = (i["result"])
print(f"result: {result}") # in the json looks like null
if result is not None:
return i
# break
time.sleep(interval)
# return result
return build_number
after we ask jenkins to build the job>get queue#>get job#> loop the info and get the status till change from None to something else.
if works will return the directory with the information of that job. (hope the jenkins library could implement something like this.)