NAV Navbar
cURL Node.js C# PHP
  • Introduction
  • Asynchronous interface
  • Realtime Websocket
  • References
  • Introduction

    You can access the Speech to Text service through a WebSocket interface and an asynchronous HTTP interface.

    We have language bindings in Shell, Node.js, C# and PHP! You can view code examples in the dark area to the right. By clicking on the tabs in the top right of the screen you can switch between the different programming languages.

    Asynchronous interface

    Our Speech to Text service requires tokens to allow access to the API. You can create a new token after logging in to your personal portal.

    Create Session

    To initialize a new session use the following code:

    # With Shell, you can simply add the correct header with each request
    # "language" is required: replace xx-xx with a supported language code (for example nl-nl)
    curl -X POST "https://api.zoommedia.ai/api/v1/speech-to-text/session" \
      -H "Content-Type: application/json" \
      -H "X-Zoom-S2T-Key: YOUR-S2T-API-TOKEN" \
      -d '{ "language": "xx-xx" }'
    
    // Create a new asynchronous Speech to Text session (Node.js, request-promise).
    var rp = require('request-promise');
    var fs = require('fs'); // used by the file upload example
    var session; // receives the sessionId returned by the API
    var headers = {
      // Requests are authorized through the X-Zoom-S2T-Key header.
      "X-Zoom-S2T-Key": "YOUR-S2T-API-TOKEN",
    };

    rp.post({
        url: "https://api.zoommedia.ai/api/v1/speech-to-text/session",
        headers: headers,
        // Session options are sent as the JSON request payload.
        json: {
            language: "nl-nl",
            punctuation: false
        }
    }).then(function(data) {
        console.log("Successfully created the session: ", data.sessionId);
        // Keep the id around: the upload and check-state requests need it.
        session = data.sessionId;
    }).catch(function(error){
        console.log("Error creating the session", error);
    });
    
    <?php
    // Create a new asynchronous Speech to Text session.
    $url = "https://api.zoommedia.ai/api/v1/speech-to-text/session";
    $headers = [
      'Content-Type: application/json',
      'X-Zoom-S2T-Key: YOUR-S2T-API-TOKEN'
    ];
    // Request payload; "language" is required, "punctuation" is optional.
    $body = array(
      "language" => "nl-nl",
      "punctuation" => false
    );

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($body));
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Keep TLS certificate verification on so the API key is never sent to an unverified host.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

    // With CURLOPT_RETURNTRANSFER set, this holds the response body (or false on failure).
    $server_output = curl_exec ($ch);
    curl_close ($ch);
    ?>
    
    private const string token = "YOUR-S2T-API-TOKEN";
    static string SessionId { get; set; }

    /// <summary>
    /// Creates a new asynchronous Speech to Text session for Dutch (nl-nl) and
    /// stores the returned session id in <see cref="SessionId"/>.
    /// On a non-success status the status code and reason are written to stderr
    /// and <see cref="SessionId"/> is left unchanged.
    /// </summary>
    private static async Task OpenSession(){
      using (var httpClient = new HttpClient()){
        httpClient.DefaultRequestHeaders.Add("X-Zoom-S2T-Key", token);
        StringContent stringContent = new StringContent("{\"language\":\"nl-nl\"}");
        stringContent.Headers.ContentType = new MediaTypeHeaderValue("application/json");
        HttpResponseMessage httpResponseMessage = await httpClient.PostAsync("https://api.zoommedia.ai/api/v1/speech-to-text/session", stringContent).ConfigureAwait(false);
        if (httpResponseMessage.IsSuccessStatusCode){
            string sessionString = await httpResponseMessage.Content.ReadAsStringAsync().ConfigureAwait(false);
            // Named sessionJson so it does not shadow the API-key constant `token`.
            JToken sessionJson = JObject.Parse(sessionString);
            SessionId = (string)sessionJson.SelectToken("sessionId");
        }
        else{
            string failureMsg = "HTTP Status: " + httpResponseMessage.StatusCode.ToString() + " - Reason: " + httpResponseMessage.ReasonPhrase;
            // Surface the failure instead of silently discarding the message.
            Console.Error.WriteLine(failureMsg);
        }
      }
    }
    

    The above command will return JSON which is structured as follows:

    {
      "sessionId":"abcdef1234567890fedcba00",
      "language":"xx-xx"
    }
    

    Make sure to replace YOUR-S2T-API-TOKEN with the API token you created.

    Initiate a new asynchronous Speech to Text session for a specific language.

    HTTP Request

    POST /api/v1/speech-to-text/session

    Request Headers

    Parameter Value Description
    Content-Type application/json Set the content type of the request body
    X-Zoom-S2T-Key YOUR-S2T-API-TOKEN API Key needed for request authorization

    Request payload

    Parameter Required Type Description
    language true string Set the language for this session
    callback_url false string If set, an HTTPS callback will be made to a web endpoint once the transcription is done
    callback_method false string Specify the method to use for the HTTP callback (allowed_values: POST, PUT)
    callback_format false string Set this to specify a transcription format (default: application/json)
    callback_headers false array Array of headers that need to be present in the callback request.
    punctuation false bool If set to true punctuation will be enabled (default: false)

    File upload

    # With Shell, you can simply add the correct header with each request
    # Replace SESSION_ID with the sessionId of your session and localfilenamepath with the file to upload
    curl -v \
        -H "X-Zoom-S2T-Key: YOUR-S2T-API-TOKEN" \
        -F upload=@localfilenamepath \
        "https://api.zoommedia.ai/api/v1/speech-to-text/session/SESSION_ID"
    
    // Upload a local file to the session created earlier (`session` holds its id).
    var formData = {
        // The API expects the file in the "upload" form field.
        upload: fs.createReadStream("FILEPATH.wav")
    };

    rp.post({
        url: "https://api.zoommedia.ai/api/v1/speech-to-text/session/" + session,
        headers: headers,
        formData: formData
    }).then(function(data) {
        console.log("File has been successfully uploaded");
    }).catch(function(error){
        console.log("Error uploading the file", error);
    });
    
    <?php
    // Upload a local file to an existing session as multipart/form-data.
    $url = "https://api.zoommedia.ai/api/v1/speech-to-text/session/".$SESSION_ID;
    // No explicit Content-Type header: when CURLOPT_POSTFIELDS receives an array,
    // cURL sends multipart/form-data and generates the boundary itself.
    $headers = [
      'X-Zoom-S2T-Key: YOUR-S2T-API-TOKEN'
    ];
    $body = array(
      // NOTE(review): assumes $file holds the local path of the file to upload - confirm.
      "upload" => new CURLFile($file)
    );
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_POST, 1);
    // Pass the array itself (not json_encode) so the file is streamed as a form part.
    curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Keep TLS certificate verification on so the API key is never sent to an unverified host.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    $server_output = curl_exec ($ch);
    curl_close ($ch);
    ?>
    
     /// <summary>
     /// Uploads the given file bytes to the session identified by SESSION_ID as
     /// the multipart form field "upload".
     /// </summary>
     /// <param name="fileBytes">Raw content of the file to transcribe.</param>
     /// <returns>The response body on success, or null when the request failed.</returns>
     private static async Task<string> Upload(byte[] fileBytes){
      HttpContent bytesContent = new ByteArrayContent(fileBytes);

      using (var client = new HttpClient()){
        client.DefaultRequestHeaders.Add("X-Zoom-S2T-Key", "YOUR-S2T-API-TOKEN");
        using (var formData = new MultipartFormDataContent()){
            // The API expects the file in the "upload" form field.
            formData.Add(bytesContent, "upload", "file");
            var response = await client.PostAsync(string.Format("https://api.zoommedia.ai/api/v1/speech-to-text/session/{0}", SESSION_ID), formData).ConfigureAwait(false);
            if (!response.IsSuccessStatusCode){
              return null;
            }
            return await response.Content.ReadAsStringAsync();
        }
      }
    }
    

    The above command will return JSON which is structured as follows:

    {
      "sessionId":"abcdef1234567890fedcba00",
      "done": false
    }
    

    HTTP Request

    POST /api/v1/speech-to-text/session/SESSION_ID

    Request Headers

    Parameter Value Description
    Content-type multipart/form-data Set the content-type of the request. Note: you need to use application/json if you want to send a video url instead of a file.
    X-Zoom-S2T-Key YOUR-S2T-API-TOKEN API Key is needed for request authorization.

    Path parameters

    Parameter Required Type Description
    SESSION_ID true string Session ID of the request.

    Body parameters for content-type: multipart/form-data (file upload)

    Parameter Required Type Description
    upload true string File to be processed.

    Body parameters for content-type: application/json (video url)

    Parameter Required Type Description
    video_url true string URL of the video file you wish to process.

    Check state

    # With Shell, you can simply add the correct header with each request.
    # Replace SESSION_ID with the sessionId of the session you want to check.
    curl -X GET \
        -H "Content-Type: application/json" \
        -H "X-Zoom-S2T-Key: YOUR-S2T-API-TOKEN" \
        "https://api.zoommedia.ai/api/v1/speech-to-text/session/SESSION_ID"
    
    // Fetch the state (and, once done, the transcription) of a session.
    var rp = require('request-promise');
    var fs = require('fs');
    var headers = {
      // Requests are authorized through the X-Zoom-S2T-Key header.
      "X-Zoom-S2T-Key": "YOUR-S2T-API-TOKEN",
      // Selects the transcription format of the response (default: application/json).
      "Accept": "application/json",
    };
    rp.get({
        url: "https://api.zoommedia.ai/api/v1/speech-to-text/session/SESSION_ID",
        headers: headers
    }).then(function(response) {
        //handle response
    }).catch(function(error){
        console.log("Error getting the results of the session", error);
    });
    
    <?php
    // Fetch the state (and, once done, the transcription) of a session.
    $url = "https://api.zoommedia.ai/api/v1/speech-to-text/session/SESSION_ID";
    $headers = [
      'Content-Type: application/json',
      'X-Zoom-S2T-Key: YOUR-S2T-API-TOKEN'
    ];
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Keep TLS certificate verification on so the API key is never sent to an unverified host.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    // With CURLOPT_RETURNTRANSFER set, this holds the response body (or false on failure).
    $server_output = curl_exec ($ch);
    curl_close ($ch);
    ?>
    
    /// <summary>
    /// Polls the session identified by SESSION_ID every 10 seconds until the
    /// service reports it as done, then stores the final response body in
    /// Result and returns it.
    /// </summary>
    /// <returns>The JSON body of the last (completed) state response.</returns>
    static async Task<string> GetResultAsync()
    {
        using (HttpClient client = new HttpClient())
        {
            client.DefaultRequestHeaders.Add("X-Zoom-S2T-Key", token);
            client.DefaultRequestHeaders
              .Accept
              .Add(new MediaTypeWithQualityHeaderValue("application/json"));
            string actuallyDone = string.Empty;
            HttpResponseMessage response = null;
            while (actuallyDone != "True")
            {
                response = await client.GetAsync(string.Format("https://api.zoommedia.ai/api/v1/speech-to-text/session/{0}", SESSION_ID));
                if (response.IsSuccessStatusCode)
                {
                    string sessionString = await response.Content.ReadAsStringAsync();
                    // Named sessionJson so it does not shadow the API-key constant `token`.
                    JToken sessionJson = JObject.Parse(sessionString);
                    // NOTE(review): relies on the JSON boolean `true` converting to the string "True" - confirm.
                    actuallyDone = (string)sessionJson.SelectToken("done");
                }
                if (actuallyDone != "True")
                {
                    // Wait without blocking the thread; this also throttles retries after a failed request.
                    await Task.Delay(10000);
                }
            }
            Result = await response.Content.ReadAsStringAsync();
            return Result;
        }
    }
    

    The above command returns JSON structured like this if the file has not been processed yet:

    {
      "sessionId":"abcdef1234567890fedcba00",
      "done": false
    }
    

    The above command returns JSON structured like this once the file has been processed:

    {
        "sessionId": "abcdef1234567890fedcba00",
        "done": true,
        "results": [{
          "result": [
            [ "Je", 12046, 12286, 1 ],
            [ "hoort", 12286, 12526, 1 ],
            [ "natuurlijk", 12526, 12796, 1 ],
            [ "zeker", 12796, 13096, 1 ],
            [ "in", 13096, 13156, 1 ],
            [ "amsterdam", 13156, 13666, 1 ],
            [ "verhalen", 13666, 14055, 0.999713 ],
            [ "dat", 14055, 14176, 1 ],
            [ "iedereen", 14176, 14596, 1 ],
            [ "alweer", 14596, 14866, 1 ],
            [ "op", 14866, 14986, 1 ],
            [ "de", 14986, 15076, 1 ],
            [ "stoep", 15076, 15316, 1 ],
            [ "staat,", 15316, 15586, 1 ],
            [ "de", 15586, 15645, 0.999034 ],
            [ "bieden.", 15645, 16066, 1 ]
          ],
          "text": "Je hoort natuurlijk zeker in amsterdam verhalen dat iedereen alweer op de stoep staat, de bieden.",
          "speaker": "unk"
        }],
        "metadata": {
          "format": "wav",
          "filename": "video.mp4",
          "mimetype": "video/mp4",
          "duration": 22
        }
    }
    

    HTTP Request

    GET /api/v1/speech-to-text/session/SESSION_ID

    Request Headers

    Parameter Value Description
    Content-type application/json Header of the request.
    Accept application/json Header that is needed to specify the format of the response. Use this header to request one of the available transcription formats (default: application/json)
    X-Zoom-S2T-Key YOUR-S2T-API-TOKEN API Key needed for request authorization.

    Path parameters

    Parameter Required Type Description
    SESSION_ID true string Session ID of the request.

    Realtime Websocket

    Our Speech to Text service requires tokens to allow access to the API. You can create a new token after logging in to your personal portal.

    Start connection

    To initialize a new session use the following code:

    -
    
    // Open a secure websocket to the real-time endpoint for Dutch (nl-nl).
    import WebSocket from "ws";
    const ws = new WebSocket('wss://api.zoommedia.ai/realtime?language=nl-nl', {
       headers: {
           // The token must be a string; unquoted it is parsed as a subtraction expression.
           "X-Zoom-S2T-Key": "YOUR-S2T-API-TOKEN"
       }
    });
    // Lifecycle hooks: fill these in to react to connection and message events.
    ws.onopen = function(evt) { };
    ws.onclose = function(evt) { };
    ws.onmessage = function(evt) { };
    
    <?php
    -
    ?>
    
    private static string _authenticationKey = "YOUR-S2T-API-TOKEN";
    private static ClientWebSocket _webSocket;

    /// <summary>
    /// Opens the websocket connection to the real-time endpoint for Dutch (nl-nl),
    /// blocking until the connection attempt has completed.
    /// </summary>
    public static void Connect()
    {
        _webSocket = new ClientWebSocket();
        _webSocket.Options.SetRequestHeader("X-Zoom-S2T-Key", _authenticationKey);
        // A ClientWebSocket instance can only be connected once, so a retry loop
        // around ConnectAsync on the same instance cannot recover from a failure.
        _webSocket.ConnectAsync(new Uri("wss://api.zoommedia.ai/realtime?language=nl-nl"), CancellationToken.None).Wait();
    }
    

    Make sure to replace YOUR-S2T-API-TOKEN with the API token you created.

    You can reach our API service by using the Websocket Secure (WSS) protocol. The endpoint is:

    WSS Request

    wss://api.zoommedia.ai/realtime?language=nl-nl

    Request Headers

    Parameter Value Description
    X-Zoom-S2T-Key YOUR-S2T-API-TOKEN API Key needed for request authorization.

    Query Parameters

    Parameter Required Type Description
    language true string Set the language for this session.

    Initiate/stop a request

    To start a real-time session send the following message once the websocket is connected:

    // Ask the service to start a real-time session; actions are sent as JSON text messages.
    ws.send(JSON.stringify({ action: "start" }));
    
    <?php
    //to do
    ?>
    
    /// <summary>
    /// Sends the JSON "start" action as a single text message over the open websocket.
    /// </summary>
    public static async Task SendStart(){
      string payload = JsonConvert.SerializeObject(new JsonMessage() { action = "start" });
      byte[] payloadBytes = Encoding.UTF8.GetBytes(payload);
      await _webSocket.SendAsync(new ArraySegment<Byte>(payloadBytes), WebSocketMessageType.Text, true, CancellationToken.None);
    }
    

    The above command will return JSON which is structured as follows:

    {
      "state": "listening"
    }
    

    Make sure to replace YOUR-S2T-API-TOKEN with the API token you created.

    Once the websocket is connected you need to send a message with the ‘action’ in order to start a real-time session. Messages should be JSON text messages.

    Websocket 'action' event

    Value Description
    start Connection with the Speech to Text engine is initialised. If the connection is successful and the websocket is ready to process data you will receive back a "state: listening" message.
    stop Connection with the Speech to Text engine is stopped. The service will return the remaining blob that has to be processed. Once the connection with the Speech to Text engine is closed you will receive a "state: stopped" message. It is not possible to start a new session once you've stopped it.

    Send audio

    -
    
    // Stream a local WAV file to the service chunk by chunk, then end the session.
    var path = require('path');
    var readStream = fs.createReadStream(path.join(__dirname, "/FilePath.wav"));
    readStream.on('data', function (chunk) {
      ws.send(chunk);
    }).on('end', function () {
        // All audio has been sent: the "stop" action makes the service return the remaining result.
        ws.send(JSON.stringify({ action: "stop" }));
    });
    
    <?php
    -
    ?>
    
    /// <summary>
    /// Streams the file at _uploadFilePath to the open websocket as binary
    /// messages of at most 1024 bytes each.
    /// </summary>
    public static async Task SendAudio(){
      using (FileStream fs = File.OpenRead(_uploadFilePath)){
        byte[] buffer = new byte[1024];
        int bytesRead;
        while ((bytesRead = await fs.ReadAsync(buffer, 0, buffer.Length)) > 0){
          // Send only the bytes actually read; the last chunk is usually shorter than
          // the buffer and would otherwise carry stale data from the previous read.
          await _webSocket.SendAsync(new ArraySegment<byte>(buffer, 0, bytesRead), WebSocketMessageType.Binary, true, CancellationToken.None);
        }
        // send stop message
      }
    }
    

    The above command will return JSON which is structured as follows:

    // first you get a partial result
    {
      "partial": "Je hoort natuurlijk zeker in amsterdam verhalen dat iedereen alweer op de stoep staat, de bieden."
    }
    
    // then you get the total result including timestamps
    {
      "result": [
        [ "Je", 12046, 12286, 1 ],
        [ "hoort", 12286, 12526, 1 ],
        [ "natuurlijk", 12526, 12796, 1 ],
        [ "zeker", 12796, 13096, 1 ],
        [ "in", 13096, 13156, 1 ],
        [ "amsterdam", 13156, 13666, 1 ],
        [ "verhalen", 13666, 14055, 0.999713 ],
        [ "dat", 14055, 14176, 1 ],
        [ "iedereen", 14176, 14596, 1 ],
        [ "alweer", 14596, 14866, 1 ],
        [ "op", 14866, 14986, 1 ],
        [ "de", 14986, 15076, 1 ],
        [ "stoep", 15076, 15316, 1 ],
        [ "staat,", 15316, 15586, 1 ],
        [ "de", 15586, 15645, 0.999034 ],
        [ "bieden.", 15645, 16066, 1 ]
      ],
      "text": "Je hoort natuurlijk zeker in amsterdam verhalen dat iedereen alweer op de stoep staat, de bieden."
    }
    

    The real-time service only supports the "audio/wav" format at the moment. The required audio sampling rate is 16 kHz and the audio codec should be PCM. When sending formats other than the one described above, the accuracy of the service will decrease, or you might not receive any results at all.

    Audio Conversion

    You can use various tools to convert your audio to a different format. Have a look at the following freeware tools that are available to convert your audio from one format to another:

    Sound eXchange (SoX) ( sox.sourceforge.net ). FFmpeg ( ffmpeg.org ). These tools offer cross-platform support for multiple audio formats.

    References

    Languages

    At this moment we support the following languages:

    Code Language
    da-dk Danish
    en-us US English
    nl-nl Dutch
    nl-be Flemish
    nb-no Norwegian
    sv-se Swedish

    Transcription format

    At this moment we support the following transcription formats:

    Format Description
    application/json Our own proprietary json format as described here
    application/xml+ttml Timed Text Markup Language (TTML) format
    text/vtt Web Video Text Tracks Format
    text/srt SubRip file format
    text/sbv SubViewer file format (Youtube caption file format)

    Output

    The standard JSON format is structured as follows:

    {
      "result": [
        [ "Je", 12046, 12286, 1 ],
        [ "hoort", 12286, 12526, 1 ],
        [ "natuurlijk", 12526, 12796, 1 ],
        [ "zeker", 12796, 13096, 1 ],
        [ "in", 13096, 13156, 1 ],
        [ "amsterdam", 13156, 13666, 1 ],
        [ "verhalen", 13666, 14055, 0.999713 ],
        [ "dat", 14055, 14176, 1 ],
        [ "iedereen", 14176, 14596, 1 ],
        [ "alweer", 14596, 14866, 1 ],
        [ "op", 14866, 14986, 1 ],
        [ "de", 14986, 15076, 1 ],
        [ "stoep", 15076, 15316, 1 ],
        [ "staat,", 15316, 15586, 1 ],
        [ "de", 15586, 15645, 0.999034 ],
        [ "bieden.", 15645, 16066, 1 ]
      ],
      "text": "Je hoort natuurlijk zeker in amsterdam verhalen dat iedereen alweer op de stoep staat, de bieden.",
      "speaker": "Zoom_Media",
      "sconf": 0.955
    }
    

    The service returns all JSON response content in the UTF-8 character set.

    Fields Description
    text Shows the final transcript of the audio containing all the words identified.
    result Represents an array with each word that has been identified. Each word is shown along with timestamps in milliseconds representing the start and endpoint, as well as word identification and confidence score. Consider the following example: [ "Je", 12046, 12286, 1 ]
    • "Je" - word identified
    • 12046 - start timestamp
    • 12286 - end timestamp
    • 1 (100%) - confidence score
    speaker Contains the name of a speaker when our service has identified someone. When our service does not identify a speaker the field will show "unk".
    sconf Service confidence score in identification of the speakers.