73 changes: 70 additions & 3 deletions aisuite-js/README.md
@@ -13,6 +13,7 @@ npm package - `npm i aisuite`
- **Streaming**: Real-time streaming responses with consistent API
- **Type Safety**: Full TypeScript support with comprehensive type definitions
- **Error Handling**: Unified error handling across providers
- **Speech-to-Text**: Automatic Speech Recognition (ASR) support with multiple providers (OpenAI Whisper, Deepgram)

## Installation

@@ -26,8 +27,12 @@ npm install aisuite
import { Client } from 'aisuite';

const client = new Client({
openai: { apiKey: process.env.OPENAI_API_KEY },
openai: {
apiKey: process.env.OPENAI_API_KEY,
audio: true // Enable Whisper ASR support
},
anthropic: { apiKey: process.env.ANTHROPIC_API_KEY },
deepgram: { apiKey: process.env.DEEPGRAM_API_KEY },
});

// Use any provider with identical interface
@@ -143,6 +148,42 @@ try {
}
```

### Speech-to-Text Transcription

```typescript
// Initialize client with audio support for OpenAI
const client = new Client({
openai: {
apiKey: process.env.OPENAI_API_KEY,
audio: true // Required for Whisper ASR
},
deepgram: { apiKey: process.env.DEEPGRAM_API_KEY }
});

// Using Deepgram
const deepgramResponse = await client.audio.transcriptions.create({
model: 'deepgram:nova-2',
file: audioBuffer, // Buffer containing audio data
language: 'en-US',
timestamps: true,
word_confidence: true,
speaker_labels: true,
});

// Using OpenAI Whisper (requires audio: true in config)
const openaiResponse = await client.audio.transcriptions.create({
model: 'openai:whisper-1',
file: audioBuffer,
language: 'en',
response_format: 'verbose_json',
temperature: 0,
timestamps: true,
});

console.log('Transcribed Text:', openaiResponse.text);
console.log('Words with timestamps:', openaiResponse.words);
```
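
In Node.js, the `audioBuffer` used above can simply be read from disk with `fs.readFileSync`, as in the example scripts included in this repo:

```typescript
import * as fs from 'fs';

// Read an audio file into a Buffer (e.g. WAV or MP3)
const audioBuffer = fs.readFileSync('test-audio.wav');
```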

### Error Handling

```typescript
@@ -174,10 +215,15 @@ const client = new Client({
apiKey: string;
baseURL?: string;
organization?: string;
audio?: boolean; // Enable Whisper ASR support
},
anthropic?: {
apiKey: string;
baseURL?: string;
},
deepgram?: {
apiKey: string;
baseURL?: string;
}
});
```
@@ -199,21 +245,42 @@ interface ChatCompletionRequest {
}
```

### Transcription Request

All ASR providers share a common transcription request format; additional provider-specific parameters are passed through unchanged:

```typescript
interface TranscriptionRequest {
model: string; // "provider:model" format
file: Buffer; // Audio file as Buffer
language?: string; // Language code (e.g., "en", "en-US")
timestamps?: boolean; // Include word-level timestamps
[key: string]: any; // Additional provider-specific parameters:
// For OpenAI: See https://platform.openai.com/docs/api-reference/audio/createTranscription
// For Deepgram: See https://developers.deepgram.com/reference/speech-to-text-api/listen
}
```
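
For example, Deepgram options such as `smart_format` can be passed alongside the common fields. A minimal sketch (`smart_format` and `punctuate` are Deepgram API parameters forwarded as-is, not part of the unified interface):

```typescript
// Sketch: provider-specific Deepgram options passed through the common request.
const result = await client.audio.transcriptions.create({
  model: 'deepgram:nova-2',
  file: audioBuffer,     // Buffer containing the audio data
  language: 'en-US',
  timestamps: true,
  smart_format: true,    // Deepgram-specific: automatic formatting/punctuation
  punctuate: true,       // Deepgram-specific
});
```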

### Helper Methods

```typescript
// List configured providers
// List all configured providers (including ASR)
client.listProviders(); // ['openai', 'anthropic']
client.listASRProviders(); // ['deepgram', 'openai']

// Check if a provider is configured
client.isProviderConfigured('openai'); // true
client.isASRProviderConfigured('deepgram'); // true
```

## Current Limitations

- Only OpenAI and Anthropic providers are currently supported (Gemini, Mistral, and Bedrock coming soon)
- Only OpenAI and Anthropic providers are currently supported for chat (Gemini, Mistral, and Bedrock coming soon)
- Tool calling requires handling tool responses manually
- Streaming tool calls require manual accumulation of arguments (see the sketch after this list)
- ASR support is currently limited to OpenAI Whisper and Deepgram
- Some provider-specific ASR features require passing provider-specific parameters through the transcription request
- OpenAI Whisper ASR must be enabled explicitly with `audio: true` in the OpenAI provider config
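
As a rough illustration of the streaming tool-call limitation, here is a minimal sketch of accumulating argument fragments. It assumes the stream yields OpenAI-style chunks carrying `delta.tool_calls`; the exact chunk shape is an assumption, so check `examples/streaming.ts` for the supported streaming API:

```typescript
// Sketch only — assumes OpenAI-style streaming chunks with delta.tool_calls.
const stream = await client.chat.completions.create({
  model: 'openai:gpt-4o', // hypothetical model id; any "provider:model" string
  messages: [{ role: 'user', content: 'What is the weather in Paris?' }],
  tools: [{
    type: 'function',
    function: {
      name: 'get_weather',
      description: 'Get the current weather for a city',
      parameters: {
        type: 'object',
        properties: { city: { type: 'string' } },
        required: ['city'],
      },
    },
  }],
  stream: true,
});

// Tool-call arguments arrive as fragments; accumulate them by tool-call index.
const toolCalls: Record<number, { name?: string; args: string }> = {};
for await (const chunk of stream) {
  for (const call of chunk.choices[0]?.delta?.tool_calls ?? []) {
    const entry = (toolCalls[call.index] ??= { args: '' });
    if (call.function?.name) entry.name = call.function.name;
    if (call.function?.arguments) entry.args += call.function.arguments;
  }
}
// When the stream ends, each entry.args should contain a complete JSON string.
```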

## Development

73 changes: 73 additions & 0 deletions aisuite-js/examples/deepgram.ts
@@ -0,0 +1,73 @@
import { Client } from "../src";
import * as fs from "fs";
import * as path from "path";

async function main() {
// Initialize the client with Deepgram configuration
// Using Deepgram SDK v4.11.2 with the new createClient API
const client = new Client({
deepgram: {
apiKey: process.env.DEEPGRAM_API_KEY || "your-deepgram-api-key",
},
});

console.log("Available ASR providers:", client.listASRProviders());

// Example: Transcribe an audio file
try {
// Path to your audio file (replace with your actual audio file)
const testAudioPath = path.join("test-audio.wav");

// Check if the test file exists; if not, log a message and exit early
if (!fs.existsSync(testAudioPath)) {
console.log(
"Test audio file not found. Please provide a valid audio file for transcription."
);
console.log("Expected path:", testAudioPath);
return;
}

// Read the file as a buffer
const audioBuffer = fs.readFileSync(testAudioPath);

// Create the transcription request with the audio buffer
const result = await client.audio.transcriptions.create({
model: "deepgram:general",
file: audioBuffer,
language: "en-US",
timestamps: true,
word_confidence: true,
speaker_labels: true,
});

console.log("Transcription Result:");
console.log("Text:", result.text);
console.log("Language:", result.language);
console.log("Confidence:", result.confidence);

if (result.words && result.words.length > 0) {
console.log("\nWords with timestamps:");
result.words.slice(0, 5).forEach((word, index) => {
console.log(
`${index + 1}. "${word.text}" (${word.start}s - ${
word.end
}s, confidence: ${word.confidence})`
);
});
}

if (result.segments && result.segments.length > 0) {
console.log("\nSegments:");
result.segments.forEach((segment, index) => {
console.log(
`${index + 1}. [${segment.start}s - ${segment.end}s] ${segment.text}`
);
});
}
} catch (error) {
console.error("Error during transcription:", error);
}
}

main().catch(console.error);

69 changes: 69 additions & 0 deletions aisuite-js/examples/openai-asr.ts
@@ -0,0 +1,69 @@
import { Client } from "../src";
import * as fs from "fs";
import * as path from "path";

async function main() {
// Initialize the client with OpenAI configuration
const client = new Client({
openai: {
apiKey: process.env.OPENAI_API_KEY || "your-openai-api-key",
audio: true,
},
});

console.log("Available ASR providers:", client.listASRProviders());

// Example: Transcribe an audio file
try {
// Path to your audio file
const testAudioPath = path.join("test-audio.wav");

// Check if test file exists
if (!fs.existsSync(testAudioPath)) {
console.log(
"Test audio file not found. Please provide a valid audio file for transcription."
);
console.log("Expected path:", testAudioPath);
return;
}

const audioBuffer = fs.readFileSync(testAudioPath);

// Transcribe using OpenAI Whisper model
const result = await client.audio.transcriptions.create({
model: "openai:whisper-1",
file: audioBuffer,
language: "en",
response_format: "verbose_json",
temperature: 0,
timestamps: true,
});

console.log("Transcription Result:");
console.log("Text:", result.text);
console.log("Language:", result.language);
console.log("Confidence:", result.confidence);

if (result.words && result.words.length > 0) {
console.log("\nWords with timestamps:");
result.words.slice(0, 5).forEach((word, index) => {
console.log(
`${index + 1}. "${word.text}" (${word.start}s - ${word.end}s, confidence: ${word.confidence})`
);
});
}

if (result.segments && result.segments.length > 0) {
console.log("\nSegments:");
result.segments.slice(0, 3).forEach((segment, index) => {
console.log(
`${index + 1}. "${segment.text}" (${segment.start}s - ${segment.end}s)`
);
});
}
} catch (error) {
console.error("Error:", error);
}
}

main().catch(console.error);
3 changes: 3 additions & 0 deletions aisuite-js/package.json
@@ -13,12 +13,15 @@
"example:streaming": "tsx examples/streaming.ts",
"example:mistral": "tsx examples/mistral.ts",
"example:groq": "tsx examples/groq.ts",
"example:deepgram": "tsx examples/deepgram.ts",
"example:openai-asr": "tsx examples/openai-asr.ts",
"lint": "eslint src/**/*.ts",
"prepublishOnly": "npm run build",
"dev": "tsc --watch"
},
"dependencies": {
"@anthropic-ai/sdk": "^0.56.0",
"@deepgram/sdk": "^4.11.2",
"@mistralai/mistralai": "^0.1.3",
"groq-sdk": "^0.29.0",
"openai": "^4.0.0"
53 changes: 53 additions & 0 deletions aisuite-js/src/asr-providers/deepgram/adapters.ts
@@ -0,0 +1,53 @@
import { TranscriptionResult, Word, Segment } from "../../types";

export function adaptResponse(response: any): TranscriptionResult {
const words: Word[] = [];
const segments: Segment[] = [];

// Handle Deepgram response structure
if (response.results?.channels?.[0]?.alternatives?.[0]) {
const alternative = response.results.channels[0].alternatives[0];

// Extract words with timestamps and confidence
if (alternative.words) {
alternative.words.forEach((word: any) => {
words.push({
text: word.word,
start: word.start,
end: word.end,
confidence: word.confidence,
speaker: word.speaker?.toString(),
});
});
}

// Extract utterances/segments
if (response.results.utterances) {
response.results.utterances.forEach((utterance: any) => {
segments.push({
text: utterance.transcript,
start: utterance.start,
end: utterance.end,
speaker: utterance.speaker?.toString(),
});
});
}

return {
text: alternative.transcript,
language: response.metadata?.language || "unknown",
confidence: alternative.confidence,
words,
segments,
};
}

// Fallback for unexpected response structure
return {
text: response.transcript || "",
language: "unknown",
confidence: undefined,
words: [],
segments: [],
};
}
2 changes: 2 additions & 0 deletions aisuite-js/src/asr-providers/deepgram/index.ts
@@ -0,0 +1,2 @@
export { DeepgramASRProvider } from "./provider";
export type { DeepgramConfig } from "./types";