Google Cloud's rate and pitch prosody attributes - google-cloud-platform

I am new to Google Cloud's Text-to-speech. The docs show the <prosody> tag with rate and pitch attributes. But these do not make a difference in my requests. For example, if I use rate="slow" or rate="fast", or pitch="+2st" or pitch="-2st", the result is the same and different from the example on the docs, which has a slower rate and lower tone.
I ensured the latest version with:
python3 -m pip install --upgrade google-cloud-texttospeech
Minimal reproducible example:
import os
from google.cloud import texttospeech
# Point the client library at the service-account key; this is done before
# the client object below is constructed.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/file"

# SSML sent to the service. Swap in the alternative below to compare the
# two prosody settings.
SSML_TEXT = '<prosody rate="fast" pitch="+2st">Can you hear me now?</prosody>'
# or this one:
# SSML_TEXT = '<prosody rate="slow" pitch="-2st">Can you hear me now?</prosody>'

synthesizer = texttospeech.TextToSpeechClient()

request_input = texttospeech.SynthesisInput(ssml=SSML_TEXT)
selected_voice = texttospeech.VoiceSelectionParams(
    language_code="en-US",
    name="en-US-Wavenet-A",
)
wav_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.LINEAR16,
)

result = synthesizer.synthesize_speech(
    input=request_input,
    voice=selected_voice,
    audio_config=wav_config,
)

# Persist the returned audio bytes.
with open("/tmp/cloud.wav", "wb") as wav_file:
    wav_file.write(result.audio_content)
How can I use Google Cloud's rate and pitch prosody attributes?

According to this document, an SSML script passed to Text-to-Speech must be wrapped in a <speak> root element, like this:
<speak>
<prosody rate="slow" pitch="low">Hi good morning have a nice day</prosody>
</speak>
You can refer to the below mentioned piece of code, I tried at my end and it is working for me.
Code 1 :
I used pitch as low and rate as slow .
from google.cloud import texttospeech
# SSML document: the <prosody> element sits inside a <speak> root and asks
# for a slow rate and a low pitch.
SSML_DOCUMENT = '<speak><prosody rate="slow" pitch="low">Hi good morning have a nice day</prosody></speak>'

tts = texttospeech.TextToSpeechClient()

# Text to synthesize.
ssml_request = texttospeech.SynthesisInput(ssml=SSML_DOCUMENT)

# Voice selection: language "en-US", SSML voice gender MALE.
male_voice = texttospeech.VoiceSelectionParams(
    language_code="en-US",
    ssml_gender=texttospeech.SsmlVoiceGender.MALE,
)

# Requested output encoding.
mp3_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3,
)

# Run the synthesis with the input, voice and audio settings above.
synthesis = tts.synthesize_speech(
    input=ssml_request,
    voice=male_voice,
    audio_config=mp3_config,
)

# Save the returned audio bytes and report where they went.
with open("output.mp3", "wb") as mp3_file:
    mp3_file.write(synthesis.audio_content)
    print('Audio content written to file "output.mp3"')
Audio output : output audio
Code 2 :
I used a rate as fast and pitch as +5st.
from google.cloud import texttospeech
# SSML document: the <prosody> element sits inside a <speak> root and asks
# for a fast rate and a pitch raised by five semitones.
SSML_DOCUMENT = '<speak><prosody rate="fast" pitch="+5st">Hi good morning have a nice day</prosody></speak>'

tts = texttospeech.TextToSpeechClient()

# Text to synthesize.
ssml_request = texttospeech.SynthesisInput(ssml=SSML_DOCUMENT)

# Voice selection: language "en-US", SSML voice gender MALE.
male_voice = texttospeech.VoiceSelectionParams(
    language_code="en-US",
    ssml_gender=texttospeech.SsmlVoiceGender.MALE,
)

# Requested output encoding.
mp3_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3,
)

# Run the synthesis with the input, voice and audio settings above.
synthesis = tts.synthesize_speech(
    input=ssml_request,
    voice=male_voice,
    audio_config=mp3_config,
)

# Save the returned audio bytes and report where they went.
with open("output.mp3", "wb") as mp3_file:
    mp3_file.write(synthesis.audio_content)
    print('Audio content written to file "output.mp3"')
Audio output : output audio

Related

Re-encoding audio file to linear16 for google cloud speech api fails with '[Errno 30] Read-only file system'

I'm trying to convert an audio file to linear 16 format using FFmpeg module. I've stored the audio file in one cloud storage bucket and want to move the converted file to a different bucket. The code works perfectly in VS code and deploys successfully to cloud functions. But, fails with [Errno 30] Read-only file system when run on the cloud.
Here's the code
from google.cloud import speech
from google.cloud import storage
import ffmpeg
import sys
# Name of the bucket that receives the re-encoded audio (passed to
# process_audio as its out_bucket argument).
out_bucket = 'encoded_audio_landing'
# Name of the bucket the source audio is read from (passed to process_audio
# as its input_bucket_name argument).
input_bucket_name = 'audio_landing'
def process_audio(input_bucket_name, in_filename, out_bucket):
    """Re-encode one call recording and move it to the output bucket.

    The recording is fetched from ``input_bucket_name``, converted by
    ffmpeg to mono, 16 kHz, signed 16-bit little-endian PCM (linear16),
    uploaded to ``out_bucket`` under the same object name, and finally the
    source object is deleted.

    Params:
        input_bucket_name: bucket holding the source recording
        in_filename: object name of the recording inside that bucket
        out_bucket: bucket that receives the converted audio

    Note: the source is downloaded to a local path equal to the object
    name, i.e. relative to the current working directory.
    """
    gcs = storage.Client()
    source_blob = gcs.bucket(input_bucket_name).blob(in_filename)
    object_name = source_blob.name

    source_blob.download_to_filename(object_name)
    print('type contents: ', type('processedfile'))

    # Output settings handed to ffmpeg: raw s16le container, PCM codec,
    # one channel, 16 kHz sample rate.
    encode_options = dict(format="s16le", acodec="pcm_s16le", ac=1, ar="16k")
    try:
        job = ffmpeg.input(object_name)
        job = job.output('pipe: a', **encode_options)
        job = job.overwrite_output()
        out, err = job.run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as failure:
        # Surface ffmpeg's own diagnostics, then abort.
        print(failure.stderr, file=sys.stderr)
        sys.exit(1)

    # Store the converted bytes under the same object name.
    target_blob = gcs.bucket(out_bucket).blob(object_name)
    target_blob.upload_from_string(out)

    # Remove the source object now that the converted copy is uploaded.
    source_blob.delete()
def hello_gcs(event, context):
    """Background Cloud Function entry point for Cloud Storage events.

    Logs a few fields of the triggering object and then re-encodes it via
    ``process_audio``.

    Args:
        event (dict): Event payload. The fields read here are ``bucket``,
            ``name`` and ``metageneration``; the payload follows the Cloud
            Storage ``object`` format described at
            https://cloud.google.com/storage/docs/json_api/v1/objects#resource
        context (google.cloud.functions.Context): Metadata of the
            triggering event (not read by this function).
    Returns:
        None; the output is written to Cloud Logging.
    """
    # (label, event key) pairs, logged in this order.
    logged_fields = (
        ('Bucket', 'bucket'),
        ('File', 'name'),
        ('Metageneration', 'metageneration'),
    )
    for label, key in logged_fields:
        print('{}: {}'.format(label, event[key]))

    # Convert the audio encoding of the object that triggered the event.
    print('begin process_audio')
    process_audio(input_bucket_name, event['name'], out_bucket)
The problem was that I was downloading the file to my local directory, which obviously wouldn't work on the cloud. I read another article where someone added a get_file_path function and used its result as the input to blob.download_to_filename(). I'm not sure why that worked.
I did try just removing the whole download_tofilename bit, but it didn't work without that.
I'd very much appreciate an explanation if someone knows why
def get_file_path(filename):
    """Return a download path for *filename* inside the temp directory.

    The name is first passed through ``secure_filename`` and then joined
    onto ``tempfile.gettempdir()``, so the file is not written to the
    current working directory.
    NOTE(review): presumably the temp directory is the writable location
    in the Cloud Functions runtime -- confirm against the platform docs.
    """
    temp_dir = tempfile.gettempdir()
    safe_name = secure_filename(filename)
    return os.path.join(temp_dir, safe_name)
def process_audio(input_bucket_name, in_filename, out_bucket):
    """Download one call recording to a temp path and re-encode it.

    The recording is converted by ffmpeg to mono, 16 kHz, signed 16-bit
    little-endian PCM (linear16), which the Speech-to-Text API can
    transcribe.

    Params:
        input_bucket_name: bucket holding the source file to re-encode
        in_filename: object name of the recording inside that bucket
        out_bucket: where the newly encoded file should go (not used by
            the portion of the function shown here)
    """
    gcs = storage.Client()
    source_blob = gcs.bucket(input_bucket_name).blob(in_filename)
    print(source_blob.name)

    # Download to a path under the temp directory rather than next to the
    # running code.
    local_copy = get_file_path(source_blob.name)
    source_blob.download_to_filename(local_copy)
    print('type contents: ', type('processedfile'))

    # ffmpeg-python builds a command line for the ffmpeg executable; the
    # keyword arguments to .output() are ffmpeg output options.
    encode_options = dict(format="s16le", acodec="pcm_s16le", ac=1, ar="16k")
    try:
        job = ffmpeg.input(local_copy)
        job = job.output('pipe: a', **encode_options)
        job = job.overwrite_output()
        out, err = job.run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as failure:
        # Surface ffmpeg's own diagnostics, then abort.
        print(failure.stderr, file=sys.stderr)
        sys.exit(1)

Google Cloud Speech Transcription for Video Intelligence

I intend to use Google Cloud Speech Transcription for Video Intelligence. The following code only analyses a partial segment of the video.
# Cloud Storage URI of the video and the language code passed to the
# transcription request.
video_uri = "gs://cloudmleap/video/next/JaneGoodall.mp4"
language_code = "en-GB"
# Build a single segment whose offsets are set to 55 s (start) and 80 s (end).
segment = types.VideoSegment()
segment.start_time_offset.FromSeconds(55)
segment.end_time_offset.FromSeconds(80)
# Only this one segment is handed to transcribe_speech.
response = transcribe_speech(video_uri, language_code, [segment])
def transcribe_speech(video_uri, language_code, segments=None):
    """Run speech transcription on a video and return the finished result.

    Args:
        video_uri: URI of the video to annotate.
        language_code: language code for the transcription config.
        segments: optional list of video segments forwarded to the video
            context; defaults to None.

    Returns:
        Whatever ``operation.result()`` yields for the annotate request.
    """
    speech_config = types.SpeechTranscriptionConfig(
        language_code=language_code,
        enable_automatic_punctuation=True,
    )
    video_context = types.VideoContext(
        segments=segments,
        speech_transcription_config=speech_config,
    )
    client = videointelligence.VideoIntelligenceServiceClient()

    print(f'Processing video "{video_uri}"...')
    pending = client.annotate_video(
        input_uri=video_uri,
        features=[enums.Feature.SPEECH_TRANSCRIPTION],
        video_context=video_context,
    )
    # Block until the annotate operation has finished.
    return pending.result()
How can I automatically analyse the whole video rather than defining a particular segment ?
You can follow this tutorial in Video Intelligence google doc. This tutorial shows how to transcribe a whole video. Your input should be stored in a GCS bucket and I see that in your sample code, your video is indeed stored in a GCS bucket so you should not have any issues with this.
Just make sure that you have installed the latest Video Intelligence library.
pip install --upgrade google-cloud-videointelligence
Here is the code snippet from the Video Intelligence doc for transcribing audio:
"""Transcribe speech from a video stored on GCS."""
from google.cloud import videointelligence
def _as_seconds(offset):
    """Convert a time offset to fractional seconds."""
    return offset.seconds + offset.microseconds * 1e-6


gcs_video_uri = "gs://your_gcs_bucket/your_video.mp4"

client = videointelligence.VideoIntelligenceServiceClient()

# No segments are set on the video context; only the transcription config.
transcription_config = videointelligence.SpeechTranscriptionConfig(
    language_code="en-US", enable_automatic_punctuation=True
)
context = videointelligence.VideoContext(
    speech_transcription_config=transcription_config
)

pending = client.annotate_video(
    request={
        "features": [videointelligence.Feature.SPEECH_TRANSCRIPTION],
        "input_uri": gcs_video_uri,
        "video_context": context,
    }
)

print("\nProcessing video for speech transcription.")

finished = pending.result(timeout=600)

# Only one video is processed, so there is exactly one annotation result.
video_annotations = finished.annotation_results[0]

for transcription in video_annotations.speech_transcriptions:
    # The number of alternatives per transcription is limited by
    # SpeechTranscriptionConfig.max_alternatives. Each alternative is a
    # different candidate transcript with its own confidence score.
    for candidate in transcription.alternatives:
        print("Alternative level information:")
        print(f"Transcript: {candidate.transcript}")
        print(f"Confidence: {candidate.confidence}\n")

        print("Word level information:")
        for timed_word in candidate.words:
            begins = _as_seconds(timed_word.start_time)
            ends = _as_seconds(timed_word.end_time)
            print(f"\t{begins}s - {ends}s: {timed_word.word}")

How can I convert a transcribed .wav into a .txt file in full? - Google Speech API

I'm having trouble with converting full transcribed speech to a text file. Eventually, I get what I need but not the entire text from the audio file. Let me note this (1 Pic), I can see the whole text when I use print() function but get only one line of that text when I try to write it to .txt file (2 Pic).
Also, you can look at my code if you need additional info and stuff. Thank you in advance!
from google.cloud import speech
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'PATH'

speech_client = speech.SpeechClient()

# Load the complete recording into memory and wrap it for the API.
with open('sample.wav', "rb") as wav_file:
    audio = speech.RecognitionAudio(content=wav_file.read())

config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    # Enable automatic punctuation
    enable_automatic_punctuation=True,
)

response = speech_client.recognize(config=config, audio=audio)

for result in response.results:
    extr = result.alternatives[0].transcript
    print(extr)
    # The output file is opened in "w+" mode once per result.
    with open("guru9.txt", "w+") as txt_file:
        txt_file.write(extr)
What happens in your code is that, on every iteration, you open, write to, and close your file. You should move the opening and closing of the file outside the loop.
# Open the output file once, before the loop, so each transcript is
# appended to the same handle instead of overwriting the previous one.
# The context manager closes the file even if a write raises.
with open("guru9.txt", "w+") as myfile:
    for result in response.results:
        # Top-ranked alternative for this result.
        extr = result.alternatives[0].transcript
        myfile.write(extr)

Google cloud speech never gives a response

I am using Google Cloud Speech like the following:
def transcribe_file_with_word_time_offsets(speech_files):
    """Send every audio file matching a glob pattern to synchronous recognition.

    Args:
        speech_files: glob pattern; matching paths are processed in sorted
            order.
    """
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()
    matched_paths = sorted(glob.glob(speech_files))
    starting_number = 0
    filenames_and_text = []

    for wav_path in matched_paths:
        # Read the whole file as raw bytes.
        with io.open(wav_path, 'rb') as handle:
            content = handle.read()

        audio = speech.types.RecognitionAudio(content=content)
        # NOTE(review): 'en-UK' is not the usual BCP-47 tag for British
        # English ('en-GB') -- confirm it is an accepted language code.
        config = speech.types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=22050,
            language_code='en-UK',
        )
        response = client.recognize(config, audio)
This won't result in an error, just hang forever when it gets to the response = client.recognize(config , audio) line. I had used this exact script before without issues but now it hasn't worked for a long time.

Saving a stream while playing it using LibVLC

Using LibVLC, I'm trying to save a stream while playing it. This is the python code:
import os
import sys
import vlc
if __name__ == '__main__':
filepath = <either-some-url-or-local-path>
movie = os.path.expanduser(filepath)
if 'http://' not in filepath:
if not os.access(movie, os.R_OK):
print ( 'Error: %s file is not readable' % movie )
sys.exit(1)
instance = vlc.Instance("--sub-source marq --sout=file/ps:example.mpg")
try:
media = instance.media_new(movie)
except NameError:
print ('NameError: % (%s vs Libvlc %s)' % (sys.exc_info()[1],
vlc.__version__, vlc.libvlc_get_version()))
sys.exit(1)
player = instance.media_player_new()
player.set_media(media)
player.play()
#dont exit!
while(1):
continue
It saves the video stream to a file example.mpg. As per this doc, the command to save a stream is this :
--sout=file/ps:example.mpg
which I'm using when creating an instance of vlc.Instance:
instance = vlc.Instance("--sub-source marq --sout=file/ps:example.mpg")
But the problem is that it only saves the stream, it doesn't play the stream simultaneously.
Is there any way (in LibVLC) I can save the stream (to a local file) while playing it?
Although, I'm looking for a solution in Python 3.3.1 but it is fine if there is any C or C++ solution.
I've created a similar, but not duplicate, topic yesterday.
Idea:
The basic idea is simple enough. You have to duplicate the output stream and redirect it to a file. This is done, as Maresh correctly pointed out, using the sout=#duplicate{...} directive.
Working Solution:
The following solution works on my machine ™. I've tested it on Ubuntu 12.10 with VLC v2.0.3 (TwoFlower) and Python 2.7.1. I think it should also work on Python 3 since most of the heavy lifting is done by libVlc anyway.
import os
import sys
import vlc
if __name__ == '__main__':
    # Local import: only the keep-alive loop at the bottom needs it.
    import time

    # Set filepath to the stream URL or local file before running.
    #filepath = <either-some-url-or-local-path>
    movie = os.path.expanduser(filepath)
    # Anything carrying a scheme ("http://", "https://", "rtsp://", ...) is
    # a network location; only plain local paths are checked for
    # readability.
    if '://' not in filepath:
        if not os.access(movie, os.R_OK):
            print('Error: %s file is not readable' % movie)
            sys.exit(1)
    # duplicate{} sends the stream to two destinations: the file and the
    # display.
    instance = vlc.Instance("--sout=#duplicate{dst=file{dst=example.mpg},dst=display}")
    try:
        media = instance.media_new(movie)
    except NameError:
        # Three %s placeholders for the three values: the exception, the
        # vlc module version and the libvlc version.
        print('NameError: %s (%s vs Libvlc %s)' % (sys.exc_info()[1],
                                                   vlc.__version__,
                                                   vlc.libvlc_get_version()))
        sys.exit(1)
    player = instance.media_player_new()
    player.set_media(media)
    player.play()
    # Don't exit: keep the process alive, sleeping instead of spinning so
    # the loop does not occupy a CPU core.
    while True:
        time.sleep(1)
Helpful Links
The Command-Line help was essential to decipher the plethora of VLCs
command line options.
Chapter 3 of VLC streaming HowTo. Explains the structure of the stream output and its directives, and describes the various available modules. Chapter 4 shows some examples.
LibVLC API documentation in case you want to change media option at
runtime
Update - Saving YouTube videos:
The above code doesn't play nice with YouTube. I searched around and discovered that an additional transcode directive can be used to convert YouTube's video stream to a regular video format. I used #transcode{vcodec=mp4v,acodec=mpga,vb=800,ab=128,deinterlace}
vcodec=mp4v is the video format you want to encode in (mp4v is MPEG-4, mpgv is MPEG-1, and there is also h263, DIV1, DIV2, DIV3, I420, I422, I444, RV24, YUY2).
acodec=mpga is the audio format you want to encode in (mpga is MPEG audio layer 2, a52 is A52 i.e. AC3 sound).
vb=800 is the video bitrate in Kbit/s.
ab=128 is the audio bitrate in Kbit/s.
deinterlace tells VLC to deinterlace the video on the fly.
The updated code looks like this:
import os
import sys
import vlc
if __name__ == '__main__':
    # Local import: only the keep-alive loop at the bottom needs it.
    import time

    #filepath = <either-some-url-or-local-path>
    filepath = "http://r1---sn-nfpnnjvh-1gil.c.youtube.com/videoplayback?source=youtube&newshard=yes&fexp=936100%2C906397%2C928201%2C929117%2C929123%2C929121%2C929915%2C929906%2C929907%2C929125%2C929127%2C925714%2C929917%2C929919%2C912512%2C912515%2C912521%2C906838%2C904485%2C906840%2C931913%2C904830%2C919373%2C933701%2C904122%2C932216%2C936303%2C909421%2C912711%2C907228%2C935000&sver=3&expire=1373237257&mt=1373214031&mv=m&ratebypass=yes&id=1907b7271247a714&ms=au&ipbits=48&sparams=cp%2Cid%2Cip%2Cipbits%2Citag%2Cratebypass%2Csource%2Cupn%2Cexpire&itag=45&key=yt1&ip=2a02%3A120b%3Ac3c6%3A7190%3A6823%3Af2d%3A732c%3A3577&upn=z3zzcrvPC0U&cp=U0hWSFJOVV9KUUNONl9KSFlDOmt4Y3dEWFo3dDFu&signature=D6049FD7CD5FBD2CC6CD4D60411EE492AA0E9A77.5D0562CCF4E10A6CC53B62AAFFF6CB3BB0BA91C0"
    movie = os.path.expanduser(filepath)
    savedcopy = "yt-stream.mpg"
    # Anything carrying a scheme ("http://", "https://", "rtsp://", ...) is
    # a network location; only plain local paths are checked for
    # readability.
    if '://' not in filepath:
        if not os.access(movie, os.R_OK):
            print('Error: %s file is not readable' % movie)
            sys.exit(1)
    # Transcode first, then duplicate the result to the file named by
    # savedcopy and to the display.
    instance = vlc.Instance("--sout=#transcode{vcodec=mp4v,acodec=mpga,vb=800,ab=128,deinterlace}:duplicate{dst=file{dst=%s},dst=display}" % savedcopy)
    try:
        media = instance.media_new(movie)
    except NameError:
        # Three %s placeholders for the three values: the exception, the
        # vlc module version and the libvlc version.
        print('NameError: %s (%s vs Libvlc %s)' % (sys.exc_info()[1],
                                                   vlc.__version__,
                                                   vlc.libvlc_get_version()))
        sys.exit(1)
    player = instance.media_player_new()
    player.set_media(media)
    player.play()
    # Don't exit: keep the process alive, sleeping instead of spinning so
    # the loop does not occupy a CPU core.
    while True:
        time.sleep(1)
A couple of important points:
I've used MPEG audio and video codecs in the transcode directive. It seems to be important to use a matching extensions for the output file (mpg in this case). Otherwise VLC gets confused when opening the saved file for playback. Keep that in mind if you decide to switch to another video format.
You cannot add a regular YouTube URL as filepath. Instead you have to specify the location of the video itself. That's the reason why the filepath that I've used looks so cryptic. That filepath corresponds to video at http://www.youtube.com/watch?v=GQe3JxJHpxQ. VLC itself is able to extract the video location from a given YouTube URL, but libVLC doesn't do that out of the box. You'll have to write your own resolver to do that. See this related SO question. I followed this approach to manually resolve the video location for my tests.
I think you need to duplicate the output in order to play and record it at the same time:
vlc.Instance("--sub-source marq --sout=#stream_out_duplicate{dst=display,dst=std{access=file,mux=ts,dst=/path/file.mpg}}")
or
libvlc_media_add_option(media, ":sout=#stream_out_duplicate{dst=display,dst=std{access=file,mux=ts,dst=/path/file.mpg}}")
Did you try adding to the list of options the following option?
--sout-display
i.e.
instance = vlc.Instance("--sub-source marq --sout=file/ps:example.mpg --sout-display")
Some time ago, in a sample on the ActiveState website, I saw someone play and record an MP3 file with VLC using the vlc.py module. You can take a look at its sample code to see how to duplicate a stream. I copied the code here for you (I copied it from http://code.activestate.com/recipes/577802-using-vlcpy-to-record-an-mp3-and-save-a-cue-file/):
import vlc
import time
import os
def new_filename(ext = '.mp3'):
    """Find a free 8-digit base name in 00000000..99999999.

    A name counts as taken when the current directory contains a
    12-character entry starting with it and ending in ``ext`` or ``.cue``.

    Args:
        ext: extension of the audio file that will be created.

    Returns:
        The lowest free zero-padded 8-digit name, or None if every name in
        the range is taken.
    """
    taken = set(x[:8] for x in os.listdir('.')
                if (x.endswith(ext) or x.endswith('.cue')) and len(x) == 12)
    # A plain counter avoids the Python-2-only xrange builtin (and avoids
    # building a huge list with range on Python 2).
    i = 0
    while i < 10**8:
        s = "%08i" % i
        if s not in taken:
            return s
        i += 1
    return None
def initialize_cue_file(name,instream,audiofile):
    """Create ``<name>.cue``, write its header lines, and return it open.

    Args:
        name: base name of the cue file; also written as the TITLE.
        instream: stream identifier written as the PERFORMER.
        audiofile: audio file name written on the FILE line.

    Returns:
        The cue file object, still open for writing so that track entries
        can be appended; the caller is responsible for closing it.
    """
    cueout = '%s.cue' % name
    # open() works on both Python 2 and 3; the file() builtin is
    # Python-2-only.
    outf = open(cueout, 'w')
    outf.write('PERFORMER "%s"\n' % instream)
    outf.write('TITLE "%s"\n' % name)
    outf.write('FILE "%s" WAVE\n' % audiofile)
    # Flush so the header reaches disk even though the handle stays open.
    outf.flush()
    return outf
def initialize_player(instream, audiofile):
    """Build a VLC player that plays *instream* locally and saves it.

    Args:
        instream: location of the stream to open.
        audiofile: file name the duplicated stream is written to.

    Returns:
        A ``(player, media)`` tuple.
    """
    # Media options: duplicate the stream to the file and to the display,
    # plus the three sout flags.
    media_options = (
        "sout=#duplicate{dst=file{dst=%s},dst=display}" % audiofile,
        "no-sout-rtp-sap",
        "no-sout-standard-sap",
        "sout-keep",
    )
    vlc_instance = vlc.Instance()
    player = vlc_instance.media_player_new()
    media = vlc_instance.media_new(instream, *media_options)
    media.get_mrl()
    player.set_media(media)
    return player, media
def write_track_meta_to_cuefile(outf,instream,idx,meta,millisecs):
    """Append the next track entry to the cue file.

    Args:
        outf: open, writable cue file object.
        instream: stream identifier written as the track PERFORMER.
        idx: track number (written zero-padded to two digits).
        meta: track title.
        millisecs: track start position in milliseconds; negative values
            are treated as 0.
    """
    # Guard against a negative position, which would otherwise produce a
    # negative minutes field.
    millisecs = max(millisecs, 0)
    minutes, remainder = divmod(millisecs, 60000)
    seconds, sub_ms = divmod(remainder, 1000)
    # The last INDEX field is in frames, 75 per second (range 0..74),
    # not hundredths of a second.
    frames = sub_ms * 75 // 1000
    ts = '%02i:%02i:%02i' % (minutes, seconds, frames)

    outf.write(' TRACK %02i AUDIO\n' % idx)
    outf.write(' TITLE "%s"\n' % meta)
    outf.write(' PERFORMER "%s"\n' % instream)
    outf.write(' INDEX 01 %s\n' % ts)
    # Flush so each entry reaches disk while the handle stays open.
    outf.flush()
def test():
    """Play an online stream, save it to an mp3 and log tracks to a cue file.

    Polls the media's metadata every 0.1 s; whenever the value changes, a
    new track entry with the current playback time is appended to the cue
    file. Runs until interrupted.
    """
    #some online audio stream for which this currently works ....
    instream = 'http://streamer-mtc-aa05.somafm.com:80/stream/1018'
    #if the output filename ends with mp3 vlc knows which mux to use
    ext = '.mp3'
    name = new_filename(ext)
    audiofile = '%s%s' % (name, ext)
    outf = initialize_cue_file(name, instream, audiofile)
    p, med = initialize_player(instream, audiofile)
    p.play()
    current = None
    i = 0
    try:
        while True:
            time.sleep(.1)
            # NOTE(review): meta id 12 is presumably the "now playing"
            # field -- confirm against the vlc.Meta enum.
            new = med.get_meta(12)
            if new != current:
                i += 1
                t = p.get_time()
                # print() with a single pre-formatted string behaves the
                # same on Python 2 and Python 3.
                print("millisecs: %i" % t)
                write_track_meta_to_cuefile(outf, instream, i, new, t)
                current = new
                print("now playing: %s" % current)
    finally:
        # Close the cue file when the loop is interrupted (e.g. Ctrl-C).
        outf.close()


if __name__=='__main__':
    test()
Perhaps you need to clone your output, as suggested on the forum?