I'm using Google Speech-to-Text from the command line and getting weird results.
this is my command
gcloud beta ml speech recognize-long-running gs://my_bucket_name/call0.mp3
--language-code=en-US --async --include-word-time-offsets --enable-speaker-diarization
--diarization-speaker-count=2
This is the audio file:
https://dcs.megaphone.fm/LIT9020259030.mp3?key=4b567156fd7bdfaa90992664d4bc667c
The problems are:
the results are very bad and inaccurate
the last result contains all the other results combined
the speakerTag is only present in the last result
I got the speakerTag only for speaker 1
Here's the result json:
{
"done": true,
"metadata": {
"@type": "type.googleapis.com/google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata",
"lastUpdateTime": "2020-07-13T18:56:33.689140Z",
"progressPercent": 100,
"startTime": "2020-07-13T18:27:45.757871Z",
"uri": "gs://deepagent-db032.appspot.com/conmagi/call1.mp3"
},
"name": "398565854464473919",
"response": {
"@type": "type.googleapis.com/google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse",
"results": [
{
"alternatives": [
{
"confidence": 0.87135065,
"transcript": "love",
"words": [
{
"endTime": "11.300s",
"startTime": "10.400s",
"word": "love"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.48216835,
"transcript": "you are",
"words": [
{
"endTime": "425.100s",
"startTime": "424.500s",
"word": "you"
},
{
"endTime": "425.400s",
"startTime": "425.100s",
"word": "are"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.9194219,
"transcript": "how far is it from",
"words": [
{
"endTime": "475.200s",
"startTime": "473.800s",
"word": "how"
},
{
"endTime": "475.500s",
"startTime": "475.200s",
"word": "far"
},
{
"endTime": "475.700s",
"startTime": "475.500s",
"word": "is"
},
{
"endTime": "475.800s",
"startTime": "475.700s",
"word": "it"
},
{
"endTime": "476.100s",
"startTime": "475.800s",
"word": "from"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.823343,
"transcript": "I want",
"words": [
{
"endTime": "629.200s",
"startTime": "626.700s",
"word": "I"
},
{
"endTime": "629.800s",
"startTime": "629.200s",
"word": "want"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.56559134,
"transcript": "Blue Ivy",
"words": [
{
"endTime": "990.100s",
"startTime": "989.500s",
"word": "Blue"
},
{
"endTime": "991.100s",
"startTime": "990.100s",
"word": "Ivy"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.78465956,
"transcript": "how old is Wawa",
"words": [
{
"endTime": "1599.700s",
"startTime": "1598.500s",
"word": "how"
},
{
"endTime": "1600.100s",
"startTime": "1599.700s",
"word": "old"
},
{
"endTime": "1600.200s",
"startTime": "1600.100s",
"word": "is"
},
{
"endTime": "1600.600s",
"startTime": "1600.200s",
"word": "Wawa"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.9475956,
"transcript": "how are you",
"words": [
{
"endTime": "2022.400s",
"startTime": "2020s",
"word": "how"
},
{
"endTime": "2022.500s",
"startTime": "2022.400s",
"word": "are"
},
{
"endTime": "2022.600s",
"startTime": "2022.500s",
"word": "you"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.7494768,
"transcript": "New York mall",
"words": [
{
"endTime": "2066.200s",
"startTime": "2065.800s",
"word": "New"
},
{
"endTime": "2066.500s",
"startTime": "2066.200s",
"word": "York"
},
{
"endTime": "2067s",
"startTime": "2066.500s",
"word": "mall"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.6706576,
"transcript": "call",
"words": [
{
"endTime": "2255.600s",
"startTime": "2254.500s",
"word": "call"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.87819797,
"transcript": "call Paul Wall",
"words": [
{
"endTime": "3041.500s",
"startTime": "3040.300s",
"word": "call"
},
{
"endTime": "3041.800s",
"startTime": "3041.500s",
"word": "Paul"
},
{
"endTime": "3042.300s",
"startTime": "3041.800s",
"word": "Wall"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.8331511,
"transcript": "no",
"words": [
{
"endTime": "3101.300s",
"startTime": "3100.800s",
"word": "no"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.62488914,
"transcript": "call Jeff",
"words": [
{
"endTime": "3473.100s",
"startTime": "3470.300s",
"word": "call"
},
{
"endTime": "3473.500s",
"startTime": "3473.100s",
"word": "Jeff"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.9074697,
"transcript": "call home",
"words": [
{
"endTime": "4166.100s",
"startTime": "4162.400s",
"word": "call"
},
{
"endTime": "4166.400s",
"startTime": "4166.100s",
"word": "home"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.7917781,
"transcript": "how old are you",
"words": [
{
"endTime": "4231.800s",
"startTime": "4231.300s",
"word": "how"
},
{
"endTime": "4232.200s",
"startTime": "4231.800s",
"word": "old"
},
{
"endTime": "4232.300s",
"startTime": "4232.200s",
"word": "are"
},
{
"endTime": "4232.400s",
"startTime": "4232.300s",
"word": "you"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.70297575,
"transcript": " Europe",
"words": [
{
"endTime": "4244.200s",
"startTime": "4243s",
"word": "Europe"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.84273374,
"transcript": " how are you",
"words": [
{
"endTime": "5121.500s",
"startTime": "5115.300s",
"word": "how"
},
{
"endTime": "5122.100s",
"startTime": "5121.500s",
"word": "are"
},
{
"endTime": "5122.300s",
"startTime": "5122.100s",
"word": "you"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.7561751,
"transcript": " the only one",
"words": [
{
"endTime": "6199.900s",
"startTime": "6199.600s",
"word": "the"
},
{
"endTime": "6200.400s",
"startTime": "6199.900s",
"word": "only"
},
{
"endTime": "6200.800s",
"startTime": "6200.400s",
"word": "one"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.6547922,
"transcript": " call",
"words": [
{
"endTime": "6258.800s",
"startTime": "6256.800s",
"word": "call"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.9402823,
"transcript": " Walgreens",
"words": [
{
"endTime": "6925s",
"startTime": "6912.300s",
"word": "Walgreens"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.5217668,
"transcript": " we want to watch",
"words": [
{
"endTime": "7155.900s",
"startTime": "7155.500s",
"word": "we"
},
{
"endTime": "7156.500s",
"startTime": "7155.900s",
"word": "want"
},
{
"endTime": "7156.600s",
"startTime": "7156.500s",
"word": "to"
},
{
"endTime": "7156.700s",
"startTime": "7156.600s",
"word": "watch"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.7971729,
"transcript": " I love you",
"words": [
{
"endTime": "7199.900s",
"startTime": "7199.200s",
"word": "I"
},
{
"endTime": "7202.900s",
"startTime": "7199.900s",
"word": "love"
},
{
"endTime": "7203.100s",
"startTime": "7202.900s",
"word": "you"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"confidence": 0.8566783,
"transcript": " how old is Moana",
"words": [
{
"endTime": "7483.800s",
"startTime": "7481.300s",
"word": "how"
},
{
"endTime": "7484s",
"startTime": "7483.800s",
"word": "old"
},
{
"endTime": "7484.200s",
"startTime": "7484s",
"word": "is"
},
{
"endTime": "7484.300s",
"startTime": "7484.200s",
"word": "Moana"
}
]
}
],
"languageCode": "en-us"
},
{
"alternatives": [
{
"words": [
{
"endTime": "11.300s",
"speakerTag": 1,
"startTime": "10.400s",
"word": "love"
},
{
"endTime": "425.100s",
"speakerTag": 1,
"startTime": "424.500s",
"word": "you"
},
{
"endTime": "425.400s",
"speakerTag": 1,
"startTime": "425.100s",
"word": "are"
},
{
"endTime": "475.200s",
"speakerTag": 1,
"startTime": "473.800s",
"word": "how"
},
{
"endTime": "475.500s",
"speakerTag": 1,
"startTime": "475.200s",
"word": "far"
},
{
"endTime": "475.700s",
"speakerTag": 1,
"startTime": "475.500s",
"word": "is"
},
{
"endTime": "475.800s",
"speakerTag": 1,
"startTime": "475.700s",
"word": "it"
},
{
"endTime": "476.100s",
"speakerTag": 1,
"startTime": "475.800s",
"word": "from"
},
{
"endTime": "629.200s",
"speakerTag": 1,
"startTime": "626.700s",
"word": "I"
},
{
"endTime": "629.800s",
"speakerTag": 1,
"startTime": "629.200s",
"word": "want"
},
{
"endTime": "990.100s",
"speakerTag": 1,
"startTime": "989.500s",
"word": "Blue"
},
{
"endTime": "991.100s",
"speakerTag": 1,
"startTime": "990.100s",
"word": "Ivy"
},
{
"endTime": "1599.700s",
"speakerTag": 1,
"startTime": "1598.500s",
"word": "how"
},
{
"endTime": "1600.100s",
"speakerTag": 1,
"startTime": "1599.700s",
"word": "old"
},
{
"endTime": "1600.200s",
"speakerTag": 1,
"startTime": "1600.100s",
"word": "is"
},
{
"endTime": "1600.600s",
"speakerTag": 1,
"startTime": "1600.200s",
"word": "Wawa"
},
{
"endTime": "2022.400s",
"speakerTag": 1,
"startTime": "2020s",
"word": "how"
},
{
"endTime": "2022.500s",
"speakerTag": 1,
"startTime": "2022.400s",
"word": "are"
},
{
"endTime": "2022.600s",
"speakerTag": 1,
"startTime": "2022.500s",
"word": "you"
},
{
"endTime": "2066.200s",
"speakerTag": 1,
"startTime": "2065.800s",
"word": "New"
},
{
"endTime": "2066.500s",
"speakerTag": 1,
"startTime": "2066.200s",
"word": "York"
},
{
"endTime": "2067s",
"speakerTag": 1,
"startTime": "2066.500s",
"word": "mall"
},
{
"endTime": "2255.600s",
"speakerTag": 1,
"startTime": "2254.500s",
"word": "call"
},
{
"endTime": "3041.500s",
"speakerTag": 1,
"startTime": "3040.300s",
"word": "call"
},
{
"endTime": "3041.800s",
"speakerTag": 1,
"startTime": "3041.500s",
"word": "Paul"
},
{
"endTime": "3042.300s",
"speakerTag": 1,
"startTime": "3041.800s",
"word": "Wall"
},
{
"endTime": "3101.300s",
"speakerTag": 1,
"startTime": "3100.800s",
"word": "no"
},
{
"endTime": "3473.100s",
"speakerTag": 1,
"startTime": "3470.300s",
"word": "call"
},
{
"endTime": "3473.500s",
"speakerTag": 1,
"startTime": "3473.100s",
"word": "Jeff"
},
{
"endTime": "4166.100s",
"speakerTag": 1,
"startTime": "4162.400s",
"word": "call"
},
{
"endTime": "4166.400s",
"speakerTag": 1,
"startTime": "4166.100s",
"word": "home"
},
{
"endTime": "4231.800s",
"speakerTag": 1,
"startTime": "4231.300s",
"word": "how"
},
{
"endTime": "4232.200s",
"speakerTag": 1,
"startTime": "4231.800s",
"word": "old"
},
{
"endTime": "4232.300s",
"speakerTag": 1,
"startTime": "4232.200s",
"word": "are"
},
{
"endTime": "4232.400s",
"speakerTag": 1,
"startTime": "4232.300s",
"word": "you"
},
{
"endTime": "4244.200s",
"speakerTag": 1,
"startTime": "4243s",
"word": "Europe"
},
{
"endTime": "5121.500s",
"speakerTag": 1,
"startTime": "5115.300s",
"word": "how"
},
{
"endTime": "5122.100s",
"speakerTag": 1,
"startTime": "5121.500s",
"word": "are"
},
{
"endTime": "5122.300s",
"speakerTag": 1,
"startTime": "5122.100s",
"word": "you"
},
{
"endTime": "6199.900s",
"speakerTag": 1,
"startTime": "6199.600s",
"word": "the"
},
{
"endTime": "6200.400s",
"speakerTag": 1,
"startTime": "6199.900s",
"word": "only"
},
{
"endTime": "6200.800s",
"speakerTag": 1,
"startTime": "6200.400s",
"word": "one"
},
{
"endTime": "6258.800s",
"speakerTag": 1,
"startTime": "6256.800s",
"word": "call"
},
{
"endTime": "6925s",
"speakerTag": 1,
"startTime": "6912.300s",
"word": "Walgreens"
},
{
"endTime": "7155.900s",
"speakerTag": 1,
"startTime": "7155.500s",
"word": "we"
},
{
"endTime": "7156.500s",
"speakerTag": 1,
"startTime": "7155.900s",
"word": "want"
},
{
"endTime": "7156.600s",
"speakerTag": 1,
"startTime": "7156.500s",
"word": "to"
},
{
"endTime": "7156.700s",
"speakerTag": 1,
"startTime": "7156.600s",
"word": "watch"
},
{
"endTime": "7199.900s",
"speakerTag": 1,
"startTime": "7199.200s",
"word": "I"
},
{
"endTime": "7202.900s",
"speakerTag": 1,
"startTime": "7199.900s",
"word": "love"
},
{
"endTime": "7203.100s",
"speakerTag": 1,
"startTime": "7202.900s",
"word": "you"
},
{
"endTime": "7483.800s",
"speakerTag": 1,
"startTime": "7481.300s",
"word": "how"
},
{
"endTime": "7484s",
"speakerTag": 1,
"startTime": "7483.800s",
"word": "old"
},
{
"endTime": "7484.200s",
"speakerTag": 1,
"startTime": "7484s",
"word": "is"
},
{
"endTime": "7484.300s",
"speakerTag": 1,
"startTime": "7484.200s",
"word": "Moana"
}
]
}
]
}
]
}
}
I ran into the same issues, especially with diarization, which does not perform well.
I have also tried getting my script from AWS; I found that its word error rate was higher, but it is better at recognizing transitions from person to person.
As you are aware, this is a beta feature, and at that stage there is no SLA (Service Level Agreement) that they have to meet.
I reported this bug to the Google Team, and they have replied:
There are no SLAs or technical support obligations in a beta release
unless otherwise specified in product terms[...]. The average beta
phase lasts about six months.
So I believe it will take a while for the team to officially release this feature.
https://cloud.google.com/speech-to-text/docs/multiple-voices
speakerTag is deprecated in this API and will hardly give accurate results. I would suggest using channelTag in place of speakerTag (e.g. result.channelTag); you might get better results.
I am using google cloud-build as my CI system and Spinnaker as my CD.
I have configured cloud-build to copy my manifest and include it as an artifact.
This is my cloudbuild.yaml:
- name: 'gcr.io/cloud-builders/docker'
args: ['build', '--tag=gcr.io/$PROJECT_ID/gordion-backend:$COMMIT_SHA', '.']
- name: 'gcr.io/cloud-builders/gsutil'
args: ['cp', 'manifests/manifest.yaml', 'gs://$PROJECT_ID-kubernetes-manifests/gordion-backend/']
images: ['gcr.io/$PROJECT_ID/gordion-backend']
artifacts:
objects:
location: 'gs://${_BUCKET_NAME}/$COMMIT_SHA/'
paths: ['manifests/manifest.yaml']
When the build finishes, this is the message broadcast by Pub/Sub:
{
"id": "ba084f1c-494a-42c0-8499-1b84fbb30551",
"projectId": "gordion-2",
"status": "SUCCESS",
"source": {
"repoSource": {
"projectId": "gordion-2",
"repoName": "github_gordion-beam_gordion-backend",
"branchName": "feature/build-yaml"
}
},
"steps": [
{
"name": "gcr.io/cloud-builders/docker",
"args": [
"build",
"--tag=gcr.io/gordion-2/gordion-backend:936543554ad5fab4017319005413dd38beefda04",
"."
],
"timing": {
"startTime": "2019-10-04T12:03:46.996926010Z",
"endTime": "2019-10-04T12:03:57.357589900Z"
},
"pullTiming": {
"startTime": "2019-10-04T12:03:46.996926010Z",
"endTime": "2019-10-04T12:03:47.056257759Z"
},
"status": "SUCCESS"
},
{
"name": "gcr.io/cloud-builders/gsutil",
"args": [
"cp",
"manifests/manifest.yaml",
"gs://gordion-2-kubernetes-manifests/gordion-backend/"
],
"timing": {
"startTime": "2019-10-04T12:03:57.357662249Z",
"endTime": "2019-10-04T12:04:00.089381056Z"
},
"pullTiming": {
"startTime": "2019-10-04T12:03:57.357662249Z",
"endTime": "2019-10-04T12:03:57.436904050Z"
},
"status": "SUCCESS"
}
],
"results": {
"images": [
{
"name": "gcr.io/gordion-2/gordion-backend:936543554ad5fab4017319005413dd38beefda04",
"digest": "sha256:538771fe1be2abd54f6d092519c131019d64bd1001ba38138c565faa9fe343b6",
"pushTiming": {
"startTime": "2019-10-04T12:04:00.404451498Z",
"endTime": "2019-10-04T12:04:06.089323750Z"
}
}
],
"buildStepImages": [
"sha256:c0525aac022b1a92e97f9c6cf4ede4dd82979a9f014f05b2ec8843012a03aa60",
"sha256:fbd483fa382118462a136f916b17d3197325881b6d966ba82ee3a54f4b550e76"
],
"artifactManifest": "gs://gordion-manifests/gordion-2/936543554ad5fab4017319005413dd38beefda04/artifacts-ba084f1c-494a-42c0-8499-1b84fbb30551.json",
"numArtifacts": "1"
},
"createTime": "2019-10-04T12:03:32.668464866Z",
"startTime": "2019-10-04T12:03:38.343591796Z",
"finishTime": "2019-10-04T12:04:15.243708Z",
"timeout": "600s",
"images": [
"gcr.io/gordion-2/gordion-backend"
],
"artifacts": {
"images": [
"gcr.io/gordion-2/gordion-backend"
],
"objects": {
"location": "gs://gordion-manifests/gordion-2/936543554ad5fab4017319005413dd38beefda04/",
"paths": [
"manifests/manifest.yaml"
],
"timing": {
"startTime": "2019-10-04T12:04:08.060372060Z",
"endTime": "2019-10-04T12:04:12.248879364Z"
}
}
},
"logsBucket": "gs://1039931537996.cloudbuild-logs.googleusercontent.com",
"sourceProvenance": {
"resolvedRepoSource": {
"projectId": "gordion-2",
"repoName": "github_gordion-beam_gordion-backend",
"commitSha": "936543554ad5fab4017319005413dd38beefda04"
}
},
"buildTriggerId": "6b7405e7-c2db-4412-9f1a-186c7f7d5975",
"options": {
"substitutionOption": "ALLOW_LOOSE",
"logging": "LEGACY"
},
"logUrl": "https://console.cloud.google.com/gcr/builds/ba084f1c-494a-42c0-8499-1b84fbb30551?project=1039931537996",
"substitutions": {
"_BUCKET_NAME": "gordion-manifests"
},
"tags": [
"trigger-6b7405e7-c2db-4412-9f1a-186c7f7d5975"
],
"timing": {
"BUILD": {
"startTime": "2019-10-04T12:03:46.996877211Z",
"endTime": "2019-10-04T12:04:00.404416077Z"
},
"FETCHSOURCE": {
"startTime": "2019-10-04T12:03:39.677294695Z",
"endTime": "2019-10-04T12:03:46.930866092Z"
},
"PUSH": {
"startTime": "2019-10-04T12:04:00.404450167Z",
"endTime": "2019-10-04T12:04:14.358324463Z"
}
}
}
I can also see in cloud-build history that it produced 2 artifacts:
docker image
Kubernetes manifest
the manifest is located in this location:
gs://gordion-manifests/a91cb009fa184713b86eb8b532a75dc088a25713/manifest.yaml
As the location of the artifact is dynamic (the folders are named after the commit SHA), how do I consume this artifact as an expected artifact in Spinnaker?
You can use wildcards in the artifact's matcher.
Please refer to the documentation: see https://www.spinnaker.io/reference/artifacts/in-pipelines/#expected-artifacts for an overall description, and
https://www.spinnaker.io/reference/artifacts-with-artifactsrewrite/types/gcs-object/#in-a-trigger for GCS artifact details.
For example, the following matcher should handle your case (I don't have an instance to test it properly):
{
"type": "gcs/object",
"name": "gs://gordion-manifests/.*/manifest.yaml"
}