Uploading a video to YouTube and adding it to a playlist using the YouTube Data API v3 in Python - python-2.7

I wrote a script to upload a video to YouTube using the YouTube Data API v3 in Python, with the help of the official example code.
I also wrote another script to add the uploaded video to a playlist using the same YouTube Data API v3.
After that I combined them into a single script that uploads the video and then adds it to a playlist. I took care of authentication and scopes, but I am still getting a permission error. Here is my new script:
#!/usr/bin/python

import httplib
import httplib2
import os
import random
import sys
import time

from apiclient.discovery import build
from apiclient.errors import HttpError
from apiclient.http import MediaFileUpload
from oauth2client.file import Storage
from oauth2client.client import flow_from_clientsecrets
from oauth2client.tools import run

# Explicitly tell the underlying HTTP transport library not to retry, since
# we are handling retry logic ourselves.
httplib2.RETRIES = 1

# Maximum number of times to retry before giving up.
MAX_RETRIES = 10

# Always retry when these exceptions are raised.
RETRIABLE_EXCEPTIONS = (httplib2.HttpLib2Error, IOError, httplib.NotConnected,
                        httplib.IncompleteRead, httplib.ImproperConnectionState,
                        httplib.CannotSendRequest, httplib.CannotSendHeader,
                        httplib.ResponseNotReady, httplib.BadStatusLine)

# Always retry when an apiclient.errors.HttpError with one of these status
# codes is raised.
RETRIABLE_STATUS_CODES = [500, 502, 503, 504]

CLIENT_SECRETS_FILE = "client_secrets.json"

# A limited OAuth 2 access scope that allows for uploading files, but not other
# types of account access.
YOUTUBE_UPLOAD_SCOPE = "https://www.googleapis.com/auth/youtube.upload"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

# Helpful message to display if the CLIENT_SECRETS_FILE is missing.
MISSING_CLIENT_SECRETS_MESSAGE = """
WARNING: Please configure OAuth 2.0
To make this sample run you will need to populate the client_secrets.json file
found at:
   %s
with information from the APIs Console
https://code.google.com/apis/console#access
For more information about the client_secrets.json file format, please visit:
https://developers.google.com/api-client-library/python/guide/aaa_client_secrets
""" % os.path.abspath(os.path.join(os.path.dirname(__file__),
                                   CLIENT_SECRETS_FILE))


def get_authenticated_service():
    flow = flow_from_clientsecrets(CLIENT_SECRETS_FILE, scope=YOUTUBE_UPLOAD_SCOPE,
                                   message=MISSING_CLIENT_SECRETS_MESSAGE)
    storage = Storage("%s-oauth2.json" % sys.argv[0])
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        credentials = run(flow, storage)
    return build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                 http=credentials.authorize(httplib2.Http()))


def initialize_upload(title, description, keywords, privacyStatus, file):
    youtube = get_authenticated_service()
    tags = None
    if keywords:
        tags = keywords.split(",")

    insert_request = youtube.videos().insert(
        part="snippet,status",
        body=dict(
            snippet=dict(
                title=title,
                description=description,
                tags=tags,
                categoryId='26'
            ),
            status=dict(
                privacyStatus=privacyStatus
            )
        ),
        # chunksize=-1 means that the entire file will be uploaded in a single
        # HTTP request. (If the upload fails, it will still be retried where it
        # left off.) This is usually a best practice, but if you're using Python
        # older than 2.6 or if you're running on App Engine, you should set the
        # chunksize to something like 1024 * 1024 (1 megabyte).
        media_body=MediaFileUpload(file, chunksize=-1, resumable=True)
    )

    vid = resumable_upload(insert_request)

    # Here I added lines to add the video to a playlist
    # add_video_to_playlist(youtube, vid, "PL2JW1S4IMwYubm06iDKfDsmWVB-J8funQ")
    # youtube = get_authenticated_service()
    add_video_request = youtube.playlistItems().insert(
        part="snippet",
        body={
            'snippet': {
                'playlistId': "PL2JW1S4IMwYubm06iDKfDsmWVB-J8funQ",
                'resourceId': {
                    'kind': 'youtube#video',
                    'videoId': vid
                }
                # 'position': 0
            }
        }
    ).execute()

    # Return the new video id so the caller can print it.
    return vid


def resumable_upload(insert_request):
    response = None
    error = None
    retry = 0
    vid = None
    while response is None:
        try:
            print "Uploading file..."
            status, response = insert_request.next_chunk()
            if 'id' in response:
                print "'%s' (video id: %s) was successfully uploaded." % (
                    title, response['id'])
                vid = response['id']
            else:
                exit("The upload failed with an unexpected response: %s" % response)
        except HttpError, e:
            if e.resp.status in RETRIABLE_STATUS_CODES:
                error = "A retriable HTTP error %d occurred:\n%s" % (e.resp.status,
                                                                     e.content)
            else:
                raise
        except RETRIABLE_EXCEPTIONS, e:
            error = "A retriable error occurred: %s" % e
        if error is not None:
            print error
            retry += 1
            if retry > MAX_RETRIES:
                exit("No longer attempting to retry.")
            max_sleep = 2 ** retry
            sleep_seconds = random.random() * max_sleep
            print "Sleeping %f seconds and then retrying..." % sleep_seconds
            time.sleep(sleep_seconds)
    return vid


if __name__ == '__main__':
    title = "sample title"
    description = "sample description"
    keywords = "keyword1,keyword2,keyword3"
    privacyStatus = "public"
    file = "myfile.mp4"
    vid = initialize_upload(title, description, keywords, privacyStatus, file)
    print 'video ID is :', vid
I am not able to figure out what is wrong; I am getting a permission error even though both scripts work fine independently.
Could anyone help me figure out where I am wrong, or how to upload a video and add it to a playlist in a single script?

I found the answer: the two independent scripts actually use different scopes.
The scope for uploading is "https://www.googleapis.com/auth/youtube.upload".
The scope for adding a video to a playlist is "https://www.googleapis.com/auth/youtube".
Because the scopes are different, I had to handle the authentication separately.
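For reference, one way to avoid running two separate authentications is to request both scopes in a single OAuth flow (oauth2client's flow_from_clientsecrets accepts an iterable of scopes). A minimal sketch based on the script above, not the only possible approach:

# Sketch: request both scopes in one OAuth flow so a single credential can
# upload videos and edit playlists.
YOUTUBE_SCOPES = [
    "https://www.googleapis.com/auth/youtube.upload",  # upload videos
    "https://www.googleapis.com/auth/youtube",         # manage playlists
]

def get_authenticated_service():
    flow = flow_from_clientsecrets(CLIENT_SECRETS_FILE,
                                   scope=YOUTUBE_SCOPES,
                                   message=MISSING_CLIENT_SECRETS_MESSAGE)
    storage = Storage("%s-oauth2.json" % sys.argv[0])
    credentials = storage.get()
    if credentials is None or credentials.invalid:
        # Note: delete any previously saved *-oauth2.json token first,
        # otherwise the old single-scope credentials are reused and the
        # permission error persists.
        credentials = run(flow, storage)
    return build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                 http=credentials.authorize(httplib2.Http()))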

Related

How to get requests with serverless lambda

I'm trying to get data from a URL as a file and serve it back with the right MIME type.
I've tried a lot of different options; this is some of the Python Flask code I currently have:
## download video
@app.route('/download/<string:resource>')
def download(resource):
    asset = getasset(resource)
    # headers = {"Content-Type":"application/octet-stream","Accept-Encoding":"gzip, deflate, br","Accept":"*/*"}
    response = requests.get(asset['downloads']['h264_720'], stream=True)
    # length = response.headers.get('Content-Length')

    def exhaust(response):
        while True:
            response.raw.decode_content = True
            out = response.content.read(1024*1024)
            if not out:
                break
            yield out

    if IS_OFFLINE:
        return Response(exhaust(response), mimetype='video/mp4')
    else:
        return Response(base64.b64decode(exhaust(response)), mimetype='video/mp4')
Offline, the response is fine when reviewing it locally with "serverless wsgi serve --stage dev".
Online, the response is different (after doing "serverless deploy --stage dev")...
Please have a look at the image: on the left is the correct mp4 video file, on the right a file that is bigger and not an mp4 file.
It has something to do with base64.b64encode(r.content), but there is more to it.
I started off with this function:
### download video
# @app.route('/download/<string:resource>')
# def download(resource):
#     asset = getasset(resource)
#     r = requests.get(asset['downloads']['h264_720'], stream=True)
#     if IS_OFFLINE:
#         return Response(r.content, mimetype='video/mp4')
#     else:
#         return Response(base64.b64decode(r.content), mimetype='video/mp4')
This results in a file that looks like this and is only 200 bytes:
ftypisomisomiso2avc1mp41moovlmvhdTtraktkhd8edtselst8treftmcdmdiamdhd2UhdlrvideVideoHandlerjminfvmhddinfdrefurlstblstsdavc1HH9avcCMgMPfxrhcolrnclxpaspbtrtq+Vsttsstss3estscstszOBNC7468x69G8BClAiBBKGHAEArLiDGuc=
It has some of the first characters that I can see in the correct file:
Anyone know what's going on and how to fix it?
I did manage to reproduce the issue locally:
import requests
import base64

url = 'to a video file...'
r = requests.get(url)

with open("test.mp4", "wb") as out_file:
    # reproducing the issue with this
    message_bytes = base64.b64encode(r.content)
    # uncomment this to produce the correct output instead
    # message_bytes = base64.b64decode(message_bytes)
    out_file.write(message_bytes)
OK, I found the issue and added this to my serverless.yml:
provider:
  name: aws
  runtime: python3.9
  ### fix:
  apiGateway:
    binaryMediaTypes:
      - '*/*'
  ###
source:
https://github.com/dherault/serverless-offline/issues/464
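With '*/*' declared as a binary media type, serverless-wsgi base64-encodes the WSGI response itself and API Gateway decodes it, so (as far as I understand it) the handler no longer needs the offline/online branching. A minimal sketch, reusing the getasset() helper from the snippet above:

# Sketch: stream the upstream body through unchanged; the base64 round trip is
# handled by serverless-wsgi / API Gateway once binaryMediaTypes is set.
from flask import Flask, Response
import requests

app = Flask(__name__)

@app.route('/download/<string:resource>')
def download(resource):
    asset = getasset(resource)  # helper from the original code, defined elsewhere
    r = requests.get(asset['downloads']['h264_720'], stream=True)
    # Pass the video through in 1 MB chunks instead of buffering it all in memory.
    return Response(r.iter_content(chunk_size=1024 * 1024), mimetype='video/mp4')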

Invoke endpoint error - detectron2 on AWS Sagemaker: ValueError: Type [application/x-npy] not support this type yet

I have been following this guide for implementing a Detectron2 model on Sagemaker.
It all looks good, both on the training and the batch transform side.
However, I tried to tweak the code a bit to create an endpoint that can be invoked by sending a payload, and I am having some trouble with it.
At the end of this notebook, after creating the SageMaker model object:
model = PyTorchModel(
    name="d2-sku110k-model",
    model_data=training_job_artifact,
    role=role,
    sagemaker_session=sm_session,
    entry_point="predict_sku110k.py",
    source_dir="container_serving",
    image_uri=serve_image_uri,
    framework_version="1.6.0",
    code_location=f"s3://{bucket}/{prefix_code}",
)
I added the following code:
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m5.xlarge')
And I can see that the model has been successfully deployed.
However, when I try to predict an image with:
predictor.predict(input)
I get the following error:
ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (500) from primary with message "Type [application/x-npy] not support this type yet
Traceback (most recent call last):
File "/opt/conda/lib/python3.6/site-packages/sagemaker_inference/transformer.py", line 126, in transform
result = self._transform_fn(self._model, input_data, content_type, accept)
File "/opt/conda/lib/python3.6/site-packages/sagemaker_inference/transformer.py", line 215, in _default_transform_fn
data = self._input_fn(input_data, content_type)
File "/opt/ml/model/code/predict_sku110k.py", line 98, in input_fn
raise ValueError(err_msg)
ValueError: Type [application/x-npy] not support this type yet
I tried a bunch of different input types: an image byte-encoded (created with cv2.imencode('.jpg', cv_img)[1].tobytes()), a numpy array, a BytesIO object (created with the io module), and a dictionary of the form {'input': image} where image is any of the previous (this format was used by a TensorFlow endpoint I created some time ago).
As I think it might be relevant, I am also pasting here the inference script used as the entry point:
"""Code used for sagemaker batch transform jobs"""
from typing import BinaryIO, Mapping
import json
import logging
import sys
from pathlib import Path
import numpy as np
import cv2
import torch
from detectron2.engine import DefaultPredictor
from detectron2.config import CfgNode
##############
# Macros
##############
LOGGER = logging.Logger("InferenceScript", level=logging.INFO)
HANDLER = logging.StreamHandler(sys.stdout)
HANDLER.setFormatter(logging.Formatter("%(levelname)s | %(name)s | %(message)s"))
LOGGER.addHandler(HANDLER)
##########
# Deploy
##########
def _load_from_bytearray(request_body: BinaryIO) -> np.ndarray:
npimg = np.frombuffer(request_body, np.uint8)
return cv2.imdecode(npimg, cv2.IMREAD_COLOR)
def model_fn(model_dir: str) -> DefaultPredictor:
r"""Load trained model
Parameters
----------
model_dir : str
S3 location of the model directory
Returns
-------
DefaultPredictor
PyTorch model created by using Detectron2 API
"""
path_cfg, path_model = None, None
for p_file in Path(model_dir).iterdir():
if p_file.suffix == ".json":
path_cfg = p_file
if p_file.suffix == ".pth":
path_model = p_file
LOGGER.info(f"Using configuration specified in {path_cfg}")
LOGGER.info(f"Using model saved at {path_model}")
if path_model is None:
err_msg = "Missing model PTH file"
LOGGER.error(err_msg)
raise RuntimeError(err_msg)
if path_cfg is None:
err_msg = "Missing configuration JSON file"
LOGGER.error(err_msg)
raise RuntimeError(err_msg)
with open(str(path_cfg)) as fid:
cfg = CfgNode(json.load(fid))
cfg.MODEL.WEIGHTS = str(path_model)
cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
return DefaultPredictor(cfg)
def input_fn(request_body: BinaryIO, request_content_type: str) -> np.ndarray:
r"""Parse input data
Parameters
----------
request_body : BinaryIO
encoded input image
request_content_type : str
type of content
Returns
-------
np.ndarray
input image
Raises
------
ValueError
ValueError if the content type is not `application/x-image`
"""
if request_content_type == "application/x-image":
np_image = _load_from_bytearray(request_body)
else:
err_msg = f"Type [{request_content_type}] not support this type yet"
LOGGER.error(err_msg)
raise ValueError(err_msg)
return np_image
def predict_fn(input_object: np.ndarray, predictor: DefaultPredictor) -> Mapping:
r"""Run Detectron2 prediction
Parameters
----------
input_object : np.ndarray
input image
predictor : DefaultPredictor
Detectron2 default predictor (see Detectron2 documentation for details)
Returns
-------
Mapping
a dictionary that contains: the image shape (`image_height`, `image_width`), the predicted
bounding boxes in format x1y1x2y2 (`pred_boxes`), the confidence scores (`scores`) and the
labels associated with the bounding boxes (`pred_boxes`)
"""
LOGGER.info(f"Prediction on image of shape {input_object.shape}")
outputs = predictor(input_object)
fmt_out = {
"image_height": input_object.shape[0],
"image_width": input_object.shape[1],
"pred_boxes": outputs["instances"].pred_boxes.tensor.tolist(),
"scores": outputs["instances"].scores.tolist(),
"pred_classes": outputs["instances"].pred_classes.tolist(),
}
LOGGER.info(f"Number of detected boxes: {len(fmt_out['pred_boxes'])}")
return fmt_out
# pylint: disable=unused-argument
def output_fn(predictions, response_content_type):
r"""Serialize the prediction result into the desired response content type"""
return json.dumps(predictions)
Can anyone point out the correct format for invoking the model (or how to tweak the code to use the endpoint)? I am thinking of changing the request_content_type to 'application/json', but I am not sure that would help much.
Edit: I tried a solution inspired by this SO thread but it did not work for my case.
It's been a while since you asked this, so I hope you have found a solution already, but for people seeing this in the future...
The error appears to be because you are sending the request with the default content type (you neither specified a content type in the request nor a serializer), but your code is written so that it only responds to requests that come with the content type "application/x-image".
The default content type is "application/json".
You have two options here: either amend your code to handle the "application/json" content type, or add a Content-Type header with the right value when you invoke the endpoint. You could do the latter by changing the predict call as below:
instead of:
predictor.predict(input)
try:
predictor.predict(input, initial_args={"ContentType":"application/x-image"})
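For anyone invoking the endpoint outside of the SageMaker Python SDK, here is a minimal sketch using boto3 that sends raw image bytes with the content type the entry point expects; the endpoint name and image path are placeholders:

# Sketch: send raw JPEG bytes with an explicit Content-Type so that input_fn()
# receives "application/x-image".
import json
import boto3

runtime = boto3.client("sagemaker-runtime")

with open("test_image.jpg", "rb") as f:   # placeholder image path
    payload = f.read()

response = runtime.invoke_endpoint(
    EndpointName="my-d2-endpoint",        # placeholder, e.g. predictor.endpoint_name
    ContentType="application/x-image",    # matches the check in input_fn()
    Body=payload,
)
result = json.loads(response["Body"].read())  # output_fn() returns JSON
print(result["pred_boxes"][:3], result["scores"][:3])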

Google Cloud Pub/Sub :: google.api_core.exceptions.DeadlineExceeded: 504 Deadline Exceeded

I was testing streaming processing with Google Cloud Pub/Sub.
I forward messages from a publisher to a topic, read them from Pub/Sub in an Apache Beam pipeline, and check them with beam.Map(print).
Reading messages from Pub/Sub worked, but an error occurred after all the messages had been read.
ㅡ. This code delivers messages from the publisher to the topic:
from google.cloud import pubsub_v1
from google.cloud import bigquery
import time

# TODO(developer)
project_id = [your-project-id]
topic_id = [your-topic-id]

# Construct a BigQuery client object.
client = bigquery.Client()

# Configure the batch to publish as soon as there is ten messages,
# one kilobyte of data, or one second has passed.
batch_settings = pubsub_v1.types.BatchSettings(
    max_messages=10,   # default 100
    max_bytes=1024,    # default 1 MB
    max_latency=1,     # default 10 ms
)
publisher = pubsub_v1.PublisherClient(batch_settings)
topic_path = publisher.topic_path(project_id, topic_id)

query = """
    SELECT *
    FROM `[bigquery-schema.bigquery-dataset.bigquery-tablename]`
    LIMIT 20
"""
query_job = client.query(query)

# Resolve the publish future in a separate thread.
def callback(topic_message):
    message_id = topic_message.result()
    print(message_id)

print("The query data:")
for row in query_job:
    data = u"category={}, language={}, count={}".format(row[0], row[1], row[2])
    print(data)
    data = data.encode("utf-8")
    time.sleep(1)
    topic_message = publisher.publish(topic_path, data=data)
    topic_message.add_done_callback(callback)

print("Published messages with batch settings.")
ㅡ. Apache-beam code [for reading and processing data from pub/sub]
# Copyright 2019 Google LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START pubsub_to_gcs]
import argparse
import datetime
import json
import logging

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
import apache_beam.transforms.window as window

pipeline_options = PipelineOptions(
    streaming=True,
    save_main_session=True,
    runner='DirectRunner',
    return_immediately=True,
    initial_rpc_timeout_millis=25000,
)


class GroupWindowsIntoBatches(beam.PTransform):
    """A composite transform that groups Pub/Sub messages based on publish
    time and outputs a list of dictionaries, where each contains one message
    and its publish timestamp.
    """

    def __init__(self, window_size):
        # Convert minutes into seconds.
        self.window_size = int(window_size * 60)

    def expand(self, pcoll):
        return (
            pcoll
            # Assigns window info to each Pub/Sub message based on its
            # publish timestamp.
            | "Window into Fixed Intervals"
            >> beam.WindowInto(window.FixedWindows(self.window_size))
            | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
            # Use a dummy key to group the elements in the same window.
            # Note that all the elements in one window must fit into memory
            # for this. If the windowed elements do not fit into memory,
            # please consider using `beam.util.BatchElements`.
            # https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
            | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
            | "Groupby" >> beam.GroupByKey()
            | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
        )


class AddTimestamps(beam.DoFn):
    def process(self, element, publish_time=beam.DoFn.TimestampParam):
        """Processes each incoming windowed element by extracting the Pub/Sub
        message and its publish timestamp into a dictionary. `publish_time`
        defaults to the publish timestamp returned by the Pub/Sub server. It
        is bound to each element by Beam at runtime.
        """
        yield {
            "message_body": element.decode("utf-8"),
            "publish_time": datetime.datetime.utcfromtimestamp(
                float(publish_time)
            ).strftime("%Y-%m-%d %H:%M:%S.%f"),
        }


class WriteBatchesToGCS(beam.DoFn):
    def __init__(self, output_path):
        self.output_path = output_path

    def process(self, batch, window=beam.DoFn.WindowParam):
        """Write one batch per file to a Google Cloud Storage bucket."""
        ts_format = "%H:%M"
        window_start = window.start.to_utc_datetime().strftime(ts_format)
        window_end = window.end.to_utc_datetime().strftime(ts_format)
        filename = "-".join([self.output_path, window_start, window_end])

        with beam.io.gcp.gcsio.GcsIO().open(filename=filename, mode="w") as f:
            for element in batch:
                f.write("{}\n".format(json.dumps(element)).encode("utf-8"))


class test_func(beam.DoFn):
    def __init__(self, delimiter=','):
        self.delimiter = delimiter

    def process(self, topic_message):
        print(topic_message)


def run(input_topic, output_path, window_size=1.0, pipeline_args=None):
    # `save_main_session` is set to true because some DoFn's rely on
    # globally imported modules.
    pipeline_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True
    )

    with beam.Pipeline(options=pipeline_options) as pipeline:
        (
            pipeline
            | "Read PubSub Messages"
            >> beam.io.ReadFromPubSub(topic=input_topic)
            | "Pardo" >> beam.ParDo(test_func(','))
        )


if __name__ == "__main__":  # noqa
    input_topic = 'projects/[project-id]/topics/[pub/sub-name]'
    output_path = 'gs://[bucket-name]/[file-directory]'
    run(input_topic, output_path, 2)
# [END pubsub_to_gcs]
As a temporary measure I set return_immediately=True, but this is not a fundamental solution either.
Thank you for reading.
This seems to be a known issue of the Pub/Sub libraries, reported in another SO thread; it looks like it was recently addressed in version 1.4.2, but that release is not yet included in the Beam dependencies, which still pin google-cloud-pubsub>=0.39.0,<1.1.0.
I did some research and found that DataflowRunner appears to handle this error better than DirectRunner, which is maintained by the Apache Beam team. The issue has already been reported on the Beam site and is not resolved yet.
Also, please be advised that the troubleshooting guide for the DEADLINE_EXCEEDED error can be found here. You can check whether any of the advice presented there helps in your case, such as upgrading to the latest version of the client library.
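As a quick illustration of the DataflowRunner suggestion, here is a minimal sketch of the pipeline options; the project, region, bucket, and job name are placeholders, and the rest of the pipeline code stays the same:

# Sketch: run the same streaming pipeline on Dataflow instead of DirectRunner.
from apache_beam.options.pipeline_options import PipelineOptions

pipeline_options = PipelineOptions(
    streaming=True,
    save_main_session=True,
    runner="DataflowRunner",
    project="[project-id]",
    region="us-central1",                      # placeholder region
    temp_location="gs://[bucket-name]/tmp",    # staging/temp bucket
    job_name="pubsub-print-test",              # placeholder job name
)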

Google Cloud Speech-to-Text API

I am using the Google Cloud Speech-to-Text API and trying to transcribe a long audio file. However, the audio file in the bucket cannot be detected.
I get an error stating: IOError: [Errno 2] No such file or directory
def transcribe_gcs(gcs_uri):
    time(gcs_uri)
    """Asynchronously transcribes the audio file specified by the gcs_uri."""
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=16000,
        language_code='en-US')

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=90)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        print(u'Transcript: {}'.format(result.alternatives[0].transcript))
        print('Confidence: {}'.format(result.alternatives[0].confidence))
Try this
import requests
import json

url = "https://speech.googleapis.com/v1/speech:longrunningrecognize?key=<apiaccesskey>"
payload = {"config": {"encoding": "LINEAR16", "sample_rate_hertz": 8000,
                      "language_code": "en-IN"},
           "audio": {"uri": "gs://bucketname/file.flac"}}
r = requests.post(url, data=json.dumps(payload))
json_resp = r.json()
token_resp = json_resp['name']

url = "https://speech.googleapis.com/v1/operations/" + str(token_resp) + "?key=<apiaccesskey>"
content_response = requests.get(url)
content_json = content_response.json()
Your response is in the content_json variable.
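Since long-running recognition is asynchronous, the operation will usually not be finished on the first GET. Continuing from the snippet above, a minimal sketch of polling the same operations URL until it reports done (field names as returned by the Speech-to-Text v1 REST API):

# Sketch: poll the operation until it is done, then read the transcripts.
import time

while True:
    content_json = requests.get(url).json()
    if content_json.get('done'):
        break
    time.sleep(10)  # wait a bit before polling again

for result in content_json['response']['results']:
    print(result['alternatives'][0]['transcript'])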

Exporting Logs from Mailgun for LTS

Is there a way via the API to export Mailgun's logs to a local file for long-term storage? We need to keep our mailing logs for longer than the 30 days Mailgun retains them.
Thanks!
You can only request 300 events at a time, so you'll have to continue fetching the next page until you run out of results. You can then do whatever you'd like with the log items, such as generate a CSV or add the items to your database. Check out https://documentation.mailgun.com/en/latest/api-events.html#events for the API docs. Here's an example in Python:
import requests
import csv
from datetime import datetime, timedelta

DATETIME_FORMAT = '%d %B %Y %H:%M:%S -0000'


def get_logs(start_date, end_date, next_url=None):
    if next_url:
        logs = requests.get(next_url, auth=("api", [YOUR MAILGUN ACCESS KEY]))
    else:
        logs = requests.get(
            'https://api.mailgun.net/v3/{0}/events'.format(
                [YOUR MAILGUN SERVER NAME]
            ),
            auth=("api", [YOUR MAILGUN ACCESS KEY]),
            params={"begin": start_date.strftime(DATETIME_FORMAT),
                    "end": end_date.strftime(DATETIME_FORMAT),
                    "ascending": "yes",
                    "pretty": "yes",
                    "limit": 300,
                    "event": "accepted"}
        )
    return logs.json()


start = datetime.now() - timedelta(2)
end = timezone.now() - timedelta(1)

log_items = []
current_page = get_logs(start, end)

while current_page.get('items'):
    items = current_page.get('items')
    log_items.extend(items)
    next_url = current_page.get('paging').get('next', None)
    current_page = get_logs(start, end, next_url=next_url)

keys = log_items[0].keys()

with open('mailgun{0}.csv'.format(start.strftime('%Y-%M-%d')), 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(log_items)
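One caveat if you run the snippet above on Python 3: csv.DictWriter expects a text-mode file there, so the final block would look something like this (a minimal sketch; everything else is unchanged):

# Sketch: on Python 3, open the CSV in text mode with newline='' instead of 'wb'.
keys = log_items[0].keys()
with open('mailgun{0}.csv'.format(start.strftime('%Y-%M-%d')), 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(log_items)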
There's a simple Python script to retrieve logs for a domain; however, I haven't checked whether it hits the Events API rather than the now-deprecated Logs API...
https://github.com/getupcloud/python-mailgunlog
The original answer doesn't work without modifications. Here is the updated code that works:
#!/usr/bin/env python3

# Uses the Mailgun API to save logs to JSON file
# Set environment variables MAILGUN_API_KEY and MAILGUN_SERVER
# Optionally set MAILGUN_LOG_DAYS to number of days to retrieve logs for
# Based on https://stackoverflow.com/a/49825979
# See API guide https://documentation.mailgun.com/en/latest/api-intro.html#introduction

import os
import json
import requests
from datetime import datetime, timedelta
from email import utils

DAYS_TO_GET = os.environ.get("MAILGUN_LOG_DAYS", 7)
MAILGUN_API_KEY = os.environ.get("MAILGUN_API_KEY")
MAILGUN_SERVER = os.environ.get("MAILGUN_SERVER")

if not MAILGUN_API_KEY or not MAILGUN_SERVER:
    print("Set environment variable MAILGUN_API_KEY and MAILGUN_SERVER")
    exit(1)

ITEMS_PER_PAGE = 300  # API is limited to 300


def get_logs(start_date, next_url=None):
    if next_url:
        print(f"Getting next batch of {ITEMS_PER_PAGE} from {next_url}...")
        response = requests.get(next_url, auth=("api", MAILGUN_API_KEY))
    else:
        url = 'https://api.mailgun.net/v3/{0}/events'.format(MAILGUN_SERVER)
        start_date_formatted = utils.format_datetime(start_date)  # Mailgun wants it in RFC 2822
        print(f"Getting first batch of {ITEMS_PER_PAGE} from {url} since {start_date_formatted}...")
        response = requests.get(
            url,
            auth=("api", MAILGUN_API_KEY),
            params={"begin": start_date_formatted,
                    "ascending": "yes",
                    "pretty": "yes",
                    "limit": ITEMS_PER_PAGE,
                    "event": "accepted"}
        )
    response.raise_for_status()
    return response.json()


start = datetime.now() - timedelta(DAYS_TO_GET)

log_items = []
current_page = get_logs(start)

while current_page.get('items'):
    items = current_page.get('items')
    log_items.extend(items)
    print(f"Retrieved {len(items)} records for a total of {len(log_items)}")
    next_url = current_page.get('paging').get('next', None)
    current_page = get_logs(start, next_url=next_url)

file_out = f"mailgun-logs-{MAILGUN_SERVER}_{start.strftime('%Y-%m-%d')}_to_{datetime.now().strftime('%Y-%m-%d')}.json"
print(f"Writing out {file_out}")
with open(file_out, 'w') as file_out_handle:
    json.dump(log_items, file_out_handle, indent=4)
print("Done.")
You can have a look at MailgunLogger.
It's an open source project that can easily be deployed via Docker to fetch and store Mailgun events in a database. It features a dead simple, although rudimentary, search and allows you to add multiple accounts/domains.
Run via Docker:
docker run -d -p 5050:5050 \
-e "ML_DB_USER=username" \
-e "ML_DB_PASSWORD=password" \
-e "ML_DB_NAME=mailgun_logger" \
-e "ML_DB_HOST=my_db_host" \
--name mailgun_logger jackjoe/mailgun_logger
From there on, the interface guides you to configure everything.
In the OP case, this project can be used in a more headless fashion where you only use the database instead of the provided UI.
You can use Skyvia for exporting logs from Mailgun for LTS. Skyvia is a cloud tool for automatic Mailgun CSV import/export with powerful transformations. You can also export Mailgun ListMembers, Templates, Tags, etc. to CSV automatically on a schedule.