I intend to use Google Cloud Speech Transcription for Video Intelligence. The following code only analyses a partial segment of the video.
video_uri = "gs://cloudmleap/video/next/JaneGoodall.mp4"
language_code = "en-GB"
segment = types.VideoSegment()
response = transcribe_speech(video_uri, language_code, [segment])
def transcribe_speech(video_uri, language_code, segments=None):
video_client = videointelligence.VideoIntelligenceServiceClient()
features = [enums.Feature.SPEECH_TRANSCRIPTION]
config = types.SpeechTranscriptionConfig(
context = types.VideoContext(
print(f'Processing video "{video_uri}"...')
operation = video_client.annotate_video(
return operation.result()
How can I automatically analyse the whole video rather than defining a particular segment?
You can follow this tutorial in Google's Video Intelligence documentation, which shows how to transcribe a whole video. Your input should be stored in a GCS bucket; in your sample code the video is indeed stored in a GCS bucket, so you should not have any issues with this.
Just make sure that you have installed the latest Video Intelligence library.
pip install --upgrade google-cloud-videointelligence
Here is the code snippet from the Video Intelligence documentation for transcribing audio:
"""Transcribe speech from a video stored on GCS."""
from google.cloud import videointelligence
video_client = videointelligence.VideoIntelligenceServiceClient()
features = [videointelligence.Feature.SPEECH_TRANSCRIPTION]
config = videointelligence.SpeechTranscriptionConfig(
language_code="en-US", enable_automatic_punctuation=True
video_context = videointelligence.VideoContext(speech_transcription_config=config)
operation = video_client.annotate_video(
"features": features,
"input_uri": path,
"video_context": video_context,
print("\nProcessing video for speech transcription.")
result = operation.result(timeout=600)
# There is only one annotation_result since only
# one video is processed.
annotation_results = result.annotation_results[0]
for speech_transcription in annotation_results.speech_transcriptions:
# The number of alternatives for each transcription is limited by
# SpeechTranscriptionConfig.max_alternatives.
# Each alternative is a different possible transcription
# and has its own confidence score.
for alternative in speech_transcription.alternatives:
print("Alternative level information:")
print("Transcript: {}".format(alternative.transcript))
print("Confidence: {}\n".format(alternative.confidence))
print("Word level information:")
for word_info in alternative.words:
word = word_info.word
start_time = word_info.start_time
end_time = word_info.end_time
"\t{}s - {}s: {}".format(
start_time.seconds + start_time.microseconds * 1e-6,
end_time.seconds + end_time.microseconds * 1e-6,
I'm trying to convert an audio file to LINEAR16 format using the FFmpeg module. I've stored the audio file in one Cloud Storage bucket and want to move the converted file to a different bucket. The code works perfectly in VS Code and deploys successfully to Cloud Functions, but it fails with [Errno 30] Read-only file system when run in the cloud.
Here's the code
from google.cloud import speech
from google.cloud import storage
import ffmpeg
import sys
out_bucket = 'encoded_audio_landing'
input_bucket_name = 'audio_landing'
def process_audio(input_bucket_name, in_filename, out_bucket):
converts audio encoding for GSK call center call recordings to linear16 encoding and 16,000
hertz sample rate
in_filename: a gsk call audio file
returns an audio file encoded so that google speech to text api can transcribe
storage_client = storage.Client()
bucket = storage_client.bucket(input_bucket_name)
blob = bucket.blob(in_filename)
print('type contents: ', type('processedfile'))
#print('blob name / len / type', blob.name, len(blob.name), type(blob.name))
out, err = (
.output('pipe: a', format="s16le", acodec="pcm_s16le", ac=1, ar="16k")
.run(capture_stdout=True, capture_stderr=True)
except ffmpeg.Error as e:
print(e.stderr, file=sys.stderr)
up_bucket = storage_client.bucket(out_bucket)
up_blob = up_bucket.blob(blob.name)
#print('type / len out', type(out), len(out))
#delete source file
def hello_gcs(event, context):
"""Background Cloud Function to be triggered by Cloud Storage.
This generic function logs relevant data when a file is changed,
and works for all Cloud Storage CRUD operations.
event (dict): The dictionary with data specific to this type of event.
The `data` field contains a description of the event in
the Cloud Storage `object` format described here:
context (google.cloud.functions.Context): Metadata of triggering event.
None; the output is written to Cloud Logging
#print('Event ID: {}'.format(context.event_id))
#print('Event type: {}'.format(context.event_type))
print('Bucket: {}'.format(event['bucket']))
print('File: {}'.format(event['name']))
print('Metageneration: {}'.format(event['metageneration']))
#print('Created: {}'.format(event['timeCreated']))
#print('Updated: {}'.format(event['updated']))
#convert audio encoding
print('begin process_audio')
process_audio(input_bucket_name, event['name'], out_bucket)
The problem was that I was downloading the file to my local directory, which obviously wouldn't work in the cloud. I read another article where someone added a get_file_path function and used its result as the input to blob.download_to_filename(). I'm not sure why that worked.
I did try simply removing the whole download_to_filename step, but it didn't work without it.
I'd very much appreciate an explanation if someone knows why.
def get_file_path(filename):
    """Return a path for *filename* inside the system temporary directory.

    The name is first passed through ``secure_filename`` and the result is
    joined onto ``tempfile.gettempdir()``, so the file is placed in the
    temporary directory rather than in the current working directory.

    NOTE(review): presumably this is what avoids the read-only file system
    error on Cloud Functions, where only the temporary directory is expected
    to be writable -- confirm against the runtime documentation.
    """
    file_name = secure_filename(filename)
    return os.path.join(tempfile.gettempdir(), file_name)
def process_audio(input_bucket_name, in_filename, out_bucket):
converts audio encoding for GSK call center call recordings to linear16 encoding and 16,000
hertz sample rate
in_filename: a gsk call audio file
input_bucket_name: location of the sourcefile that needs to be re-encoded
out_bucket: where to put the newly encoded file
returns an audio file encoded so that google speech to text api can transcribe
storage_client = storage.Client()
bucket = storage_client.bucket(input_bucket_name)
blob = bucket.blob(in_filename)
#creates a temp location for the file
file_path = get_file_path(blob.name)
print('type contents: ', type('processedfile'))
#print('blob name / len / type', blob.name, len(blob.name), type(blob.name))
#invokes the ffmpeg library to re-encode the audio file; ffmpeg is actually a command line application
# that is available in Python and Google Cloud. The arguments in the .output() call are ffmpeg options; you
# pass these options into ffmpeg there
out, err = (
.output('pipe: a', format="s16le", acodec="pcm_s16le", ac=1, ar="16k")
.run(capture_stdout=True, capture_stderr=True)
except ffmpeg.Error as e:
print(e.stderr, file=sys.stderr)
I am using OCR with TEXT_DETECTION and DOCUMENT_TEXT_DETECTION to process a PDF file (InputConfig mimeType: "application/pdf"). Currently, images are being skipped during processing. Is there any way to process images that contain text in a PDF file?
To answer your question: yes, there is a way to process images with text in PDF files. According to Google's official documentation, this is normally done using OCR with DOCUMENT_TEXT_DETECTION [1].
The Vision API can detect and transcribe text from PDF and TIFF files stored in Cloud Storage. Document text detection from PDF and TIFF must be requested using the files:asyncBatchAnnotate function, which performs an offline (asynchronous) request and provides its status using the operations resources. The output from a PDF/TIFF request is written to a JSON file created in the specified Cloud Storage bucket.[2]
I don't know what language you are using, but I tried this Python code and it processes a PDF with images without skipping them.
You need to install google-cloud-storage and google-cloud-vision.
For gcs_source_uri, you have to specify your bucket name and the PDF file that you are using.
For gcs_destination_uri, you only have to specify your bucket name; leave pdf_result as it is.
import os
import re
import json
from google.cloud import vision
from google.cloud import storage
#pip install --upgrade google-cloud-storage
#pip install --upgrade google-cloud-vision
credential_path = 'your_path'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path
client = vision.ImageAnnotatorClient()
batch_size = 2
mime_type = 'application/pdf'
feature = vision.Feature(
gcs_source_uri= 'gs://your_bucketname/your_pdf_File.pdf'
gcs_source = vision.GcsSource(uri=gcs_source_uri)
input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)
gcs_destination_uri = 'gs://your_bucketname/pdf_result'
gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
output_config = vision.OutputConfig(gcs_destination=gcs_destination, batch_size= batch_size)
async_request = vision.AsyncAnnotateFileRequest(
features=[feature], input_config=input_config, output_config=output_config
operation = client.async_batch_annotate_files(requests=[async_request])
storage_client = storage.Client()
match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
bucket_name = match.group(1)
prefix = match.group(2)
bucket = storage_client.get_bucket(bucket_name)
#List object with the given prefix
blob_list = list(bucket.list_blobs(prefix=prefix))
print('Output files: ')
for blob in blob_list:
output = blob_list[0]
json_string = output.download_as_string()
response = json.loads(json_string)
first_page_response = response['responses'][0]
annotation = first_page_response['fullTextAnnotation']
print('Full text:\n')
I am new to Google Cloud's Text-to-speech. The docs show the <prosody> tag with rate and pitch attributes. But these do not make a difference in my requests. For example, if I use rate="slow" or rate="fast", or pitch="+2st" or pitch="-2st", the result is the same and different from the example on the docs, which has a slower rate and lower tone.
I ensured the latest version with:
python3 -m pip install --upgrade google-cloud-texttospeech
Minimal reproducible example:
import os
from google.cloud import texttospeech
AUDIO_CONFIG = texttospeech.AudioConfig(
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/file"
tts_client = texttospeech.TextToSpeechClient()
voice = texttospeech.VoiceSelectionParams(
name= "en-US-Wavenet-A"
ssml_input = texttospeech.SynthesisInput(
ssml='<prosody rate="fast" pitch="+2st">Can you hear me now?</prosody>'
# or this one:
#ssml='<prosody rate="slow" pitch="-2st">Can you hear me now?</prosody>'
response = tts_client.synthesize_speech(
input=ssml_input, voice=voice, audio_config=AUDIO_CONFIG
with open("/tmp/cloud.wav", 'wb') as out:
# Write the response to the output file.
How can I use Google Cloud's rate and pitch prosody attributes?
According to this document, when you write SSML inside Text-to-Speech code, the SSML should be formatted like this:
<prosody rate="slow" pitch="low">Hi good morning have a nice day</prosody>
You can refer to the code below; I tried it on my end and it works for me.
Code 1 :
I set pitch to low and rate to slow.
from google.cloud import texttospeech
client = texttospeech.TextToSpeechClient()
# Sets the text input to be synthesized
synthesis_input = texttospeech.SynthesisInput(
ssml= '<speak><prosody rate="slow" pitch="low">Hi good morning have a nice day</prosody></speak>'
# Builds the voice request, selects the language code ("en-US") and
# the SSML voice gender ("MALE")
voice = texttospeech.VoiceSelectionParams(
language_code="en-US", ssml_gender=texttospeech.SsmlVoiceGender.MALE
# Selects the type of audio file to return
audio_config = texttospeech.AudioConfig(
# Performs the text-to-speech request on the text input with the selected
# voice parameters and audio file type
response = client.synthesize_speech(
input=synthesis_input, voice=voice, audio_config=audio_config
# Writes the synthetic audio to the output file.
with open("output.mp3", "wb") as out:
# Write the response to the output file.
print('Audio content written to file "output.mp3"')
Audio output : output audio
Code 2 :
I set rate to fast and pitch to +5st.
from google.cloud import texttospeech
client = texttospeech.TextToSpeechClient()
# Sets the text input to be synthesized
synthesis_input = texttospeech.SynthesisInput(
ssml= '<speak><prosody rate="fast" pitch="+5st">Hi good morning have a nice day</prosody></speak>'
# Builds the voice request, selects the language code ("en-US") and
# the SSML voice gender ("MALE")
voice = texttospeech.VoiceSelectionParams(
language_code="en-US", ssml_gender=texttospeech.SsmlVoiceGender.MALE
# Selects the type of audio file to return
audio_config = texttospeech.AudioConfig(
# Performs the text-to-speech request on the text input with the selected
# voice parameters and audio file type
response = client.synthesize_speech(
input=synthesis_input, voice=voice, audio_config=audio_config
# Writes the synthetic audio to the output file.
with open("output.mp3", "wb") as out:
# Write the response to the output file.
print('Audio content written to file "output.mp3"')
Audio output : output audio
I am using Google Cloud Speech like the following:
def transcribe_file_with_word_time_offsets(speech_files):
"""Transcribe the given audio file synchronously and output the word time
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
client = speech.SpeechClient()
files = sorted(glob.glob(speech_files))
starting_number = 0
filenames_and_text = []
for f in files:
with io.open(f, 'rb') as audio_file:
content = audio_file.read()
audio = speech.types.RecognitionAudio(content=content)
config = speech.types.RecognitionConfig(
response = client.recognize(config , audio)
This doesn't raise an error; it just hangs forever when it reaches the response = client.recognize(config , audio) line. I had used this exact script before without issues, but it hasn't worked for a long time now.
[short summary: how to use TF high-level Estimator on Python with an external file reader? or with feed_dict?]
I've been struggling with this for a few days and couldn't find any solution online...
I'm using TF high-level modules (tf.contrib.learn.Estimator on tf1.0, or tf.estimator.Estimator on tf1.1),
features and targets (x/y) inputted through an input_fn, and the graph built on the model_fn.
I have already trained a neural network on 'small' data sets, in which the whole input is part of the graph, using slice_input_producer etc. (I can push an example to GitHub if it would help people here).
I try to train a larger nn on 'heavier' data-sets (10s-100s GB).
I have an external Python reader that does some nasty binary file reading, which I really don't want to get into.
This reader has its own queue.Queue with m1 samples. When I use it to extract the m1 {features} & {targets}, the net simply saves all these samples as constants in the first layer of the graph... completely undesired.
I try to either -
feed the output of the external file reader as input to my graph.
define a proper tf queue object that will keep updating the queue (each time a sample is dequeued, I want a completely different sample to be enqueued).
As a reminder, I am using the "high-level" API, e.g.
self.Estimator = tf.contrib.learn.Estimator(
config=tf.contrib.learn.RunConfig( ... ) )
def input_fn(self, mode):
batch_data = self.data[mode].next() # pops out a batch of samples, as numpy 4D matrices
... # some processing of batch data
features_dict = dict(data=batch_data.pop('data'))
targets_dict = batch_data
return features_dict, targets_dict
self.Estimator.fit(input_fn=lambda: self.input_fn(modekeys.TRAIN))
Attached is a final solution for integrating an external reader into the high-level TF api (tf.contrib.learn.Estimator / tf.estimator.Estimator).
Please note:
The architecture and "logic" are not important; it's a deliberately simple net.
The external reader outputs a dictionary of numpy matrices.
The input_fn uses this reader.
In order to verify that the reader "pulls new values", I both
save the recent value to self.status (should be > 1.0)
save a summary, to be viewed in tensorboard.
Code example is in gist, and below.
import tensorflow as tf
import numpy as np
modekeys = tf.contrib.learn.ModeKeys
# Tested on python 2.7.9, tf 1.1.0
class inputExample:
def __init__(self):
self.status = 0.0 # tracing which value was recently 'pushed' to the net
self.model_dir = 'temp_dir'
def input_fn(self):
# returns features and labels dictionaries as expected by tf Estimator's model_fn
data, labels = tf.py_func(func=self.input_fn_np, inp=[], Tout=[tf.float32, tf.float32], stateful=True)
data.set_shape([1,3,3,1]) # shapes are unknown and need to be set for integrating into the network
return dict(data=data), dict(labels=labels)
def input_fn_np(self):
# returns a dictionary of numpy matrices
batch_data = self.reader()
return batch_data['data'], batch_data['labels']
def model_fn(self, features, labels, mode):
# using tf 2017 convention of dictionaries of features/labels as inputs
features_in = features['data']
labels_in = labels['labels']
pred_layer = tf.layers.conv2d(name='pred', inputs=features_in, filters=1, kernel_size=3)
tf.summary.scalar(name='label', tensor=tf.squeeze(labels_in))
tf.summary.scalar(name='pred', tensor=tf.squeeze(pred_layer))
loss = None
if mode != modekeys.INFER:
loss = tf.losses.mean_squared_error(labels=labels_in, predictions=pred_layer)
train_op = None
if mode == modekeys.TRAIN:
train_op = tf.contrib.layers.optimize_loss(
learning_rate = 0.01,
optimizer = 'SGD',
global_step = tf.contrib.framework.get_global_step()
predictions = {'estim_exp': pred_layer}
return tf.contrib.learn.ModelFnOps(mode=mode, predictions=predictions, loss=loss, train_op=train_op)
def reader(self):
self.status += 1
if self.status > 1000.0:
self.status = 1.0
return dict(
data = np.random.randn(1,3,3,1).astype(dtype=np.float32),
labels = np.sin(np.ones([1,1,1,1], dtype=np.float32)*self.status)
def get_estimator(self):
self.Estimator = tf.contrib.learn.Estimator(
model_fn = self.model_fn,
model_dir = self.model_dir,
config = tf.contrib.learn.RunConfig(
save_checkpoints_steps = 10,
save_summary_steps = 10,
save_checkpoints_secs = None
if __name__ == '__main__':
ex = inputExample()
You can use tf.constant if you have the training data already in python memory as shown in the abalone TF example: https://github.com/tensorflow/tensorflow/blob/r1.1/tensorflow/examples/tutorials/estimators/abalone.py#L138-L141
Note: copying the data from disk to Python to TensorFlow is often less efficient than constructing an input pipeline in TensorFlow (i.e. loading data from disk directly into TensorFlow Tensors), such as using tf.contrib.learn.datasets.base.load_csv_without_header.