Google cloud speech never gives a response - google-cloud-platform

I am using Google Cloud Speech like the following:
def transcribe_file_with_word_time_offsets(speech_files):
"""Transcribe the given audio file synchronously and output the word time
offsets."""
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
client = speech.SpeechClient()
files = sorted(glob.glob(speech_files))
starting_number = 0
filenames_and_text = []
for f in files:
with io.open(f, 'rb') as audio_file:
content = audio_file.read()
audio = speech.types.RecognitionAudio(content=content)
config = speech.types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=22050,
language_code='en-UK')
response = client.recognize(config , audio)
This won't result in an error, just hang forever when it gets to the response = client.recognize(config , audio) line. I had used this exact script before without issues but now it hasn't worked for a long time.

Related

Re-encoding audio file to linear16 for google cloud speech api fails with '[Errno 30] Read-only file system'

I'm trying to convert an audio file to linear 16 format using FFmpeg module. I've stored the audio file in one cloud storage bucket and want to move the converted file to a different bucket. The code works perfectly in VS code and deploys successfully to cloud functions. But, fails with [Errno 30] Read-only file system when run on the cloud.
Here's the code
from google.cloud import speech
from google.cloud import storage
import ffmpeg
import sys
out_bucket = 'encoded_audio_landing'
input_bucket_name = 'audio_landing'
def process_audio(input_bucket_name, in_filename, out_bucket):
'''
converts audio encoding for GSK call center call recordings to linear16 encoding and 16,000
hertz sample rate
Params:
in_filename: a gsk call audio file
returns an audio file encoded so that google speech to text api can transcribe
'''
storage_client = storage.Client()
bucket = storage_client.bucket(input_bucket_name)
blob = bucket.blob(in_filename)
blob.download_to_filename(blob.name)
print('type contents: ', type('processedfile'))
#print('blob name / len / type', blob.name, len(blob.name), type(blob.name))
try:
out, err = (
ffmpeg.input(blob.name)
#ffmpeg.input()
.output('pipe: a', format="s16le", acodec="pcm_s16le", ac=1, ar="16k")
.overwrite_output()
.run(capture_stdout=True, capture_stderr=True)
)
except ffmpeg.Error as e:
print(e.stderr, file=sys.stderr)
sys.exit(1)
up_bucket = storage_client.bucket(out_bucket)
up_blob = up_bucket.blob(blob.name)
#print('type / len out', type(out), len(out))
up_blob.upload_from_string(out)
#delete source file
blob.delete()
def hello_gcs(event, context):
"""Background Cloud Function to be triggered by Cloud Storage.
This generic function logs relevant data when a file is changed,
and works for all Cloud Storage CRUD operations.
Args:
event (dict): The dictionary with data specific to this type of event.
The `data` field contains a description of the event in
the Cloud Storage `object` format described here:
https://cloud.google.com/storage/docs/json_api/v1/objects#resource
context (google.cloud.functions.Context): Metadata of triggering event.
Returns:
None; the output is written to Cloud Logging
"""
#print('Event ID: {}'.format(context.event_id))
#print('Event type: {}'.format(context.event_type))
print('Bucket: {}'.format(event['bucket']))
print('File: {}'.format(event['name']))
print('Metageneration: {}'.format(event['metageneration']))
#print('Created: {}'.format(event['timeCreated']))
#print('Updated: {}'.format(event['updated']))
#convert audio encoding
print('begin process_audio')
process_audio(input_bucket_name, event['name'], out_bucket)
The problem was that I was downloading the file to my local directory, which obviously wouldn't work on the cloud. I read another article where someone used added the get file path function and used that as an input into blob.download_tofilename(). I'm not sure why that worked.
I did try just removing the whole download_tofilename bit, but it didn't work without that.
I'd very much appreciate an explanation if someone knows why
#this gets around downloading the file to a local folder. it creates some sort of templ location
def get_file_path(filename):
file_name = secure_filename(filename)
return os.path.join(tempfile.gettempdir(), file_name)
def process_audio(input_bucket_name, in_filename, out_bucket):
'''
converts audio encoding for GSK call center call recordings to linear16 encoding and 16,000
hertz sample rate
Params:
in_filename: a gsk call audio file
input_bucket_name: location of the sourcefile that needs to be re-encoded
out_bucket: where to put the newly encoded file
returns an audio file encoded so that google speech to text api can transcribe
'''
storage_client = storage.Client()
bucket = storage_client.bucket(input_bucket_name)
blob = bucket.blob(in_filename)
print(blob.name)
#creates some sort of temp loaction for the tile
file_path = get_file_path(blob.name)
blob.download_to_filename(file_path)
print('type contents: ', type('processedfile'))
#print('blob name / len / type', blob.name, len(blob.name), type(blob.name))
#envokes the ffmpeg library to re-encode the audio file, it's actually some sort of command line application
# that is available in Python and google cloud. The things in the .outuput bit are options from ffmpeg, you
# pass these options into ffmpeg there
try:
out, err = (
ffmpeg.input(file_path)
#ffmpeg.input()
.output('pipe: a', format="s16le", acodec="pcm_s16le", ac=1, ar="16k")
.overwrite_output()
.run(capture_stdout=True, capture_stderr=True)
)
except ffmpeg.Error as e:
print(e.stderr, file=sys.stderr)
sys.exit(1)

Does google cloud vision api( source path- gcsSource) supports image detection (image contains text) in PDF file?

I am using OCR with TEXT_DETECTION and DOCUMENT_TEXT_DETECTION to process pdf file(InputConfig mimeType- "application/pdf"). Currently images are getting skipped while processing. Is there any possible way to process images(having text) in PDF file?
To answer your question, yes there is a way to process images with text in PDF files. According to Google official documentation, it is normally by using OCR DOCUMENT_TEXT_DETECTION [1].
The Vision API can detect and transcribe text from PDF and TIFF files stored in Cloud Storage. Document text detection from PDF and TIFF must be requested using the files:asyncBatchAnnotate function, which performs an offline (asynchronous) request and provides its status using the operations resources. The output from a PDF/TIFF request is written to a JSON file created in the specified Cloud Storage bucket.[2]
[1]https://cloud.google.com/vision/docs/ocr#optical_character_recognition_ocr
[2]https://cloud.google.com/vision/docs/pdf#vision_text_detection_pdf_gcs-gcloud
EDIT
I don't know what language you are using but I tried this python code and it processes a pdf with images without skipping them.
You need to install google-cloud-storage and google-cloud-vision.
On gcs_source_uri you have to specify your bucket name and your pdf file that you are using.
On gcs_destination_uri you only have to specify your bucket name let pdf_result as it is.
import os
import re
import json
from google.cloud import vision
from google.cloud import storage
"""
#pip install --upgrade google-cloud-storage
#pip install --upgrade google-cloud-vision
"""
credential_path = 'your_path'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path
client = vision.ImageAnnotatorClient()
batch_size = 2
mime_type = 'application/pdf'
feature = vision.Feature(
type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
gcs_source_uri= 'gs://your_bucketname/your_pdf_File.pdf'
gcs_source = vision.GcsSource(uri=gcs_source_uri)
input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)
gcs_destination_uri = 'gs://your_bucketname/pdf_result'
gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
output_config = vision.OutputConfig(gcs_destination=gcs_destination, batch_size= batch_size)
async_request = vision.AsyncAnnotateFileRequest(
features=[feature], input_config=input_config, output_config=output_config
)
operation = client.async_batch_annotate_files(requests=[async_request])
operation.result(timeout=180)
storage_client = storage.Client()
match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
bucket_name = match.group(1)
prefix = match.group(2)
bucket = storage_client.get_bucket(bucket_name)
#List object with the given prefix
blob_list = list(bucket.list_blobs(prefix=prefix))
print('Output files: ')
for blob in blob_list:
print(blob.name)
output = blob_list[0]
json_string = output.download_as_string()
response = json.loads(json_string)
first_page_response = response['responses'][0]
annotation = first_page_response['fullTextAnnotation']
print('Full text:\n')
print(annotation['text'])

Google Cloud's rate and pitch prosody attributes

I am new to Google Cloud's Text-to-speech. The docs show the <prosody> tag with rate and pitch attributes. But these do not make a difference in my requests. For example, if I use rate="slow" or rate="fast", or pitch="+2st" or pitch="-2st", the result is the same and different from the example on the docs, which has a slower rate and lower tone.
I ensured the latest version with:
python3 -m pip install --upgrade google-cloud-texttospeech
Minimal reproducible example:
import os
from google.cloud import texttospeech
AUDIO_CONFIG = texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.LINEAR16)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/file"
tts_client = texttospeech.TextToSpeechClient()
voice = texttospeech.VoiceSelectionParams(
language_code="en-US",
name= "en-US-Wavenet-A"
)
ssml_input = texttospeech.SynthesisInput(
ssml='<prosody rate="fast" pitch="+2st">Can you hear me now?</prosody>'
# or this one:
#ssml='<prosody rate="slow" pitch="-2st">Can you hear me now?</prosody>'
)
response = tts_client.synthesize_speech(
input=ssml_input, voice=voice, audio_config=AUDIO_CONFIG
)
with open("/tmp/cloud.wav", 'wb') as out:
# Write the response to the output file.
out.write(response.audio_content)
How can I use Google Cloud's rate and pitch prosody attributes?
According to this document, when you are writing a SSML script inside Text-to-Speech code, the format for the SSML script should be like :
<speak>
<prosody rate="slow" pitch="low">Hi good morning have a nice day</prosody>
</speak>
You can refer to the below mentioned piece of code, I tried at my end and it is working for me.
Code 1 :
I used pitch as low and rate as slow .
from google.cloud import texttospeech
client = texttospeech.TextToSpeechClient()
# Sets the text input to be synthesized
synthesis_input = texttospeech.SynthesisInput(
ssml= '<speak><prosody rate="slow" pitch="low">Hi good morning have a nice day</prosody></speak>'
)
# Builds the voice request, selects the language code ("en-US") and
# the SSML voice gender ("MALE")
voice = texttospeech.VoiceSelectionParams(
language_code="en-US", ssml_gender=texttospeech.SsmlVoiceGender.MALE
)
# Selects the type of audio file to return
audio_config = texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.MP3
)
# Performs the text-to-speech request on the text input with the selected
# voice parameters and audio file type
response = client.synthesize_speech(
input=synthesis_input, voice=voice, audio_config=audio_config
)
# Writes the synthetic audio to the output file.
with open("output.mp3", "wb") as out:
# Write the response to the output file.
out.write(response.audio_content)
print('Audio content written to file "output.mp3"')
Audio output : output audio
Code 2 :
I used a rate as fast and pitch as +5st.
from google.cloud import texttospeech
client = texttospeech.TextToSpeechClient()
# Sets the text input to be synthesized
synthesis_input = texttospeech.SynthesisInput(
ssml= '<speak><prosody rate="fast" pitch="+5st">Hi good morning have a nice day</prosody></speak>'
)
# Builds the voice request, selects the language code ("en-US") and
# the SSML voice gender ("MALE")
voice = texttospeech.VoiceSelectionParams(
language_code="en-US", ssml_gender=texttospeech.SsmlVoiceGender.MALE
)
# Selects the type of audio file to return
audio_config = texttospeech.AudioConfig(
audio_encoding=texttospeech.AudioEncoding.MP3
)
# Performs the text-to-speech request on the text input with the selected
# voice parameters and audio file type
response = client.synthesize_speech(
input=synthesis_input, voice=voice, audio_config=audio_config
)
# Writes the synthetic audio to the output file.
with open("output.mp3", "wb") as out:
# Write the response to the output file.
out.write(response.audio_content)
print('Audio content written to file "output.mp3"')
Audio output : output audio

Google Cloud Speech-to-Text AP

I am using Google Cloud Speech-to-Text AP and trying to transcribe long audio file.However the audio file from the bucket cannot be detected.
I get an error stating :IOError: [Errno 2] No such file or directory:
def transcribe_gcs(gcs_uri):
time(gcs_uri)
"""Asynchronously transcribes the audio file specified by the gcs_uri."""
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
client = speech.SpeechClient()
audio = types.RecognitionAudio(uri=gcs_uri)
config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
sample_rate_hertz=16000,
language_code='en-US')
operation = client.long_running_recognize(config, audio)
print('Waiting for operation to complete...')
response = operation.result(timeout=90)
# Each result is for a consecutive portion of the audio. Iterate through
# them to get the transcripts for the entire audio file.
for result in response.results:
# The first alternative is the most likely one for this portion.
print(u'Transcript: {}'.format(result.alternatives[0].transcript))
print('Confidence: {}'.format(result.alternatives[0].confidence))
Try this
import requests
import json
url = "https://speech.googleapis.com/v1/speech:longrunningrecognize?key=<apiaccesskey>"
payload = {"config": {"encoding": "LINEAR16","sample_rate_hertz": 8000,
"language_code": "en-IN"},
"audio": {"uri": "gs://bucketname/file.flac"}}
r = requests.post(url, data=json.dumps(payload))
json_resp = r.json()
token_resp=json_resp['name']
url = "https://speech.googleapis.com/v1/operations/" + str(token_resp) +
"?key=<apiacesskey>"
content_response = requests.get(url)
content_json = content_response.json()
Your response is in content_json variable.

Uploading video to YouTube and adding it to playlist using YouTube Data API v3 in Python

I wrote a script to upload a video to YouTube using YouTube Data API v3 in the python with help of example given in Example code.
And I wrote another script to add uploaded video to playlist using same YouTube Data API v3 you can be seen here
After that I wrote a single script to upload video and add that video to playlist. In that I took care of authentication and scops still I am getting permission error. here is my new script
#!/usr/bin/python
import httplib
import httplib2
import os
import random
import sys
import time
from apiclient.discovery import build
from apiclient.errors import HttpError
from apiclient.http import MediaFileUpload
from oauth2client.file import Storage
from oauth2client.client import flow_from_clientsecrets
from oauth2client.tools import run
# Explicitly tell the underlying HTTP transport library not to retry, since
# we are handling retry logic ourselves.
httplib2.RETRIES = 1
# Maximum number of times to retry before giving up.
MAX_RETRIES = 10
# Always retry when these exceptions are raised.
RETRIABLE_EXCEPTIONS = (httplib2.HttpLib2Error, IOError, httplib.NotConnected,
httplib.IncompleteRead, httplib.ImproperConnectionState,
httplib.CannotSendRequest, httplib.CannotSendHeader,
httplib.ResponseNotReady, httplib.BadStatusLine)
# Always retry when an apiclient.errors.HttpError with one of these status
# codes is raised.
RETRIABLE_STATUS_CODES = [500, 502, 503, 504]
CLIENT_SECRETS_FILE = "client_secrets.json"
# A limited OAuth 2 access scope that allows for uploading files, but not other
# types of account access.
YOUTUBE_UPLOAD_SCOPE = "https://www.googleapis.com/auth/youtube.upload"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
# Helpful message to display if the CLIENT_SECRETS_FILE is missing.
MISSING_CLIENT_SECRETS_MESSAGE = """
WARNING: Please configure OAuth 2.0
To make this sample run you will need to populate the client_secrets.json file
found at:
%s
with information from the APIs Console
https://code.google.com/apis/console#access
For more information about the client_secrets.json file format, please visit:
https://developers.google.com/api-client-library/python/guide/aaa_client_secrets
""" % os.path.abspath(os.path.join(os.path.dirname(__file__),
CLIENT_SECRETS_FILE))
def get_authenticated_service():
flow = flow_from_clientsecrets(CLIENT_SECRETS_FILE, scope=YOUTUBE_UPLOAD_SCOPE,
message=MISSING_CLIENT_SECRETS_MESSAGE)
storage = Storage("%s-oauth2.json" % sys.argv[0])
credentials = storage.get()
if credentials is None or credentials.invalid:
credentials = run(flow, storage)
return build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
http=credentials.authorize(httplib2.Http()))
def initialize_upload(title,description,keywords,privacyStatus,file):
youtube = get_authenticated_service()
tags = None
if keywords:
tags = keywords.split(",")
insert_request = youtube.videos().insert(
part="snippet,status",
body=dict(
snippet=dict(
title=title,
description=description,
tags=tags,
categoryId='26'
),
status=dict(
privacyStatus=privacyStatus
)
),
# chunksize=-1 means that the entire file will be uploaded in a single
# HTTP request. (If the upload fails, it will still be retried where it
# left off.) This is usually a best practice, but if you're using Python
# older than 2.6 or if you're running on App Engine, you should set the
# chunksize to something like 1024 * 1024 (1 megabyte).
media_body=MediaFileUpload(file, chunksize=-1, resumable=True)
)
vid=resumable_upload(insert_request)
#Here I added lines to add video to playlist
#add_video_to_playlist(youtube,vid,"PL2JW1S4IMwYubm06iDKfDsmWVB-J8funQ")
#youtube = get_authenticated_service()
add_video_request=youtube.playlistItems().insert(
part="snippet",
body={
'snippet': {
'playlistId': "PL2JW1S4IMwYubm06iDKfDsmWVB-J8funQ",
'resourceId': {
'kind': 'youtube#video',
'videoId': vid
}
#'position': 0
}
}
).execute()
def resumable_upload(insert_request):
response = None
error = None
retry = 0
vid=None
while response is None:
try:
print "Uploading file..."
status, response = insert_request.next_chunk()
if 'id' in response:
print "'%s' (video id: %s) was successfully uploaded." % (
title, response['id'])
vid=response['id']
else:
exit("The upload failed with an unexpected response: %s" % response)
except HttpError, e:
if e.resp.status in RETRIABLE_STATUS_CODES:
error = "A retriable HTTP error %d occurred:\n%s" % (e.resp.status,
e.content)
else:
raise
except RETRIABLE_EXCEPTIONS, e:
error = "A retriable error occurred: %s" % e
if error is not None:
print error
retry += 1
if retry > MAX_RETRIES:
exit("No longer attempting to retry.")
max_sleep = 2 ** retry
sleep_seconds = random.random() * max_sleep
print "Sleeping %f seconds and then retrying..." % sleep_seconds
time.sleep(sleep_seconds)
return vid
if __name__ == '__main__':
title="sample title"
description="sample description"
keywords="keyword1,keyword2,keyword3"
privacyStatus="public"
file="myfile.mp4"
vid=initialize_upload(title,description,keywords,privacyStatus,file)
print 'video ID is :',vid
I am not able to figure out what is wrong. I am getting permission error. both script works fine independently.
could anyone help me figure out where I am wrong or how to achieve uploading video and adding that too playlist.
I got the answer actually in both the independent script scope is different.
scope for uploading is "https://www.googleapis.com/auth/youtube.upload"
scope for adding to playlist is "https://www.googleapis.com/auth/youtube"
as scope is different so I had to handle authentication separately.