73 changes: 70 additions & 3 deletions aisuite-js/README.md
@@ -13,6 +13,7 @@ npm package - `npm i aisuite`
- **Streaming**: Real-time streaming responses with consistent API
- **Type Safety**: Full TypeScript support with comprehensive type definitions
- **Error Handling**: Unified error handling across providers
- **Speech-to-Text**: Automatic Speech Recognition (ASR) support with multiple providers (OpenAI Whisper, Deepgram)

## Installation

@@ -26,8 +27,12 @@ npm install aisuite
import { Client } from 'aisuite';

const client = new Client({
openai: { apiKey: process.env.OPENAI_API_KEY },
openai: {
apiKey: process.env.OPENAI_API_KEY,
audio: true // Enable Whisper ASR support
},
anthropic: { apiKey: process.env.ANTHROPIC_API_KEY },
deepgram: { apiKey: process.env.DEEPGRAM_API_KEY },
});

// Use any provider with identical interface
@@ -143,6 +148,42 @@ try {
}
```

### Speech-to-Text Transcription

```typescript
// Initialize client with audio support for OpenAI
const client = new Client({
openai: {
apiKey: process.env.OPENAI_API_KEY,
audio: true // Required for Whisper ASR
},
deepgram: { apiKey: process.env.DEEPGRAM_API_KEY }
});

// Using Deepgram
const deepgramResponse = await client.audio.transcriptions.create({
model: 'deepgram:nova-2',
file: audioBuffer, // Buffer containing audio data
language: 'en-US',
timestamps: true,
word_confidence: true,
speaker_labels: true,
});

// Using OpenAI Whisper (requires audio: true in config)
const openaiResponse = await client.audio.transcriptions.create({
model: 'openai:whisper-1',
file: audioBuffer,
language: 'en',
response_format: 'verbose_json',
temperature: 0,
timestamps: true,
});

console.log('Transcribed Text:', openaiResponse.text);
console.log('Words with timestamps:', openaiResponse.words);
```
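
In Node.js, the `audioBuffer` used above can simply be read from disk with `fs.readFileSync`, as in the example scripts included in this repo:

```typescript
import * as fs from 'fs';

// Read an audio file into a Buffer (e.g. WAV or MP3)
const audioBuffer = fs.readFileSync('test-audio.wav');
```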

### Error Handling

```typescript
@@ -174,10 +215,15 @@ const client = new Client({
apiKey: string;
baseURL?: string;
organization?: string;
audio?: boolean; // Enable Whisper ASR support
},
anthropic?: {
apiKey: string;
baseURL?: string;
},
deepgram?: {
apiKey: string;
baseURL?: string;
}
});
```
@@ -199,21 +245,42 @@ interface ChatCompletionRequest {
}
```

### Transcription Request

All ASR providers share a common transcription request format; additional provider-specific parameters are passed through unchanged:

```typescript
interface TranscriptionRequest {
model: string; // "provider:model" format
file: Buffer; // Audio file as Buffer
language?: string; // Language code (e.g., "en", "en-US")
timestamps?: boolean; // Include word-level timestamps
[key: string]: any; // Additional provider-specific parameters:
// For OpenAI: See https://platform.openai.com/docs/api-reference/audio/createTranscription
// For Deepgram: See https://developers.deepgram.com/reference/speech-to-text-api/listen
}
```
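
For example, Deepgram options such as `smart_format` can be passed alongside the common fields. A minimal sketch (`smart_format` and `punctuate` are Deepgram API parameters forwarded as-is, not part of the unified interface):

```typescript
// Sketch: provider-specific Deepgram options passed through the common request.
const result = await client.audio.transcriptions.create({
  model: 'deepgram:nova-2',
  file: audioBuffer,     // Buffer containing the audio data
  language: 'en-US',
  timestamps: true,
  smart_format: true,    // Deepgram-specific: automatic formatting/punctuation
  punctuate: true,       // Deepgram-specific
});
```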

### Helper Methods

```typescript
// List configured providers
// List all configured providers (including ASR)
client.listProviders(); // ['openai', 'anthropic']
client.listASRProviders(); // ['deepgram', 'openai']

// Check if a provider is configured
client.isProviderConfigured('openai'); // true
client.isASRProviderConfigured('deepgram'); // true
```

## Current Limitations

- Only OpenAI and Anthropic providers are currently supported (Gemini, Mistral, and Bedrock coming soon)
- Only OpenAI and Anthropic providers are currently supported for chat (Gemini, Mistral, and Bedrock coming soon)
- Tool calling requires handling tool responses manually
- Streaming tool calls require manual accumulation of arguments (see the sketch after this list)
- ASR support is currently limited to OpenAI Whisper and Deepgram
- Some provider-specific ASR features require passing provider-specific parameters through the transcription request
- OpenAI Whisper ASR must be enabled explicitly with `audio: true` in the OpenAI provider config
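
As a rough illustration of the streaming tool-call limitation, here is a minimal sketch of accumulating argument fragments. It assumes the stream yields OpenAI-style chunks carrying `delta.tool_calls`; the exact chunk shape is an assumption, so check `examples/streaming.ts` for the supported streaming API:

```typescript
// Sketch only — assumes OpenAI-style streaming chunks with delta.tool_calls.
const stream = await client.chat.completions.create({
  model: 'openai:gpt-4o', // hypothetical model id; any "provider:model" string
  messages: [{ role: 'user', content: 'What is the weather in Paris?' }],
  tools: [{
    type: 'function',
    function: {
      name: 'get_weather',
      description: 'Get the current weather for a city',
      parameters: {
        type: 'object',
        properties: { city: { type: 'string' } },
        required: ['city'],
      },
    },
  }],
  stream: true,
});

// Tool-call arguments arrive as fragments; accumulate them by tool-call index.
const toolCalls: Record<number, { name?: string; args: string }> = {};
for await (const chunk of stream) {
  for (const call of chunk.choices[0]?.delta?.tool_calls ?? []) {
    const entry = (toolCalls[call.index] ??= { args: '' });
    if (call.function?.name) entry.name = call.function.name;
    if (call.function?.arguments) entry.args += call.function.arguments;
  }
}
// When the stream ends, each entry.args should contain a complete JSON string.
```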

## Development

73 changes: 73 additions & 0 deletions aisuite-js/examples/deepgram.ts
@@ -0,0 +1,73 @@
import { Client } from "../src";
import * as fs from "fs";
import * as path from "path";

async function main() {
// Initialize the client with Deepgram configuration
// Using Deepgram SDK v4.11.2 with the new createClient API
const client = new Client({
deepgram: {
apiKey: process.env.DEEPGRAM_API_KEY || "your-deepgram-api-key",
},
});

console.log("Available ASR providers:", client.listASRProviders());

// Example: Transcribe an audio file
try {
// Path to your audio file (replace with your actual audio file)
const testAudioPath = path.join("test-audio.wav");

// Check if the test file exists; if not, log a message and exit early
if (!fs.existsSync(testAudioPath)) {
console.log(
"Test audio file not found. Please provide a valid audio file for transcription."
);
console.log("Expected path:", testAudioPath);
return;
}

// Read the file as a buffer
const audioBuffer = fs.readFileSync(testAudioPath);

// Create the transcription request with the audio buffer
const result = await client.audio.transcriptions.create({
model: "deepgram:general",
file: audioBuffer,
language: "en-US",
timestamps: true,
word_confidence: true,
speaker_labels: true,
});

console.log("Transcription Result:");
console.log("Text:", result.text);
console.log("Language:", result.language);
console.log("Confidence:", result.confidence);

if (result.words && result.words.length > 0) {
console.log("\nWords with timestamps:");
result.words.slice(0, 5).forEach((word, index) => {
console.log(
`${index + 1}. "${word.text}" (${word.start}s - ${
word.end
}s, confidence: ${word.confidence})`
);
});
}

if (result.segments && result.segments.length > 0) {
console.log("\nSegments:");
result.segments.forEach((segment, index) => {
console.log(
`${index + 1}. [${segment.start}s - ${segment.end}s] ${segment.text}`
);
});
}
} catch (error) {
console.error("Error during transcription:", error);
}
}

main().catch(console.error);

69 changes: 69 additions & 0 deletions aisuite-js/examples/openai-asr.ts
@@ -0,0 +1,69 @@
import { Client } from "../src";
import * as fs from "fs";
import * as path from "path";

async function main() {
// Initialize the client with OpenAI configuration
const client = new Client({
openai: {
apiKey: process.env.OPENAI_API_KEY || "your-openai-api-key",
audio: true,
},
});

console.log("Available ASR providers:", client.listASRProviders());

// Example: Transcribe an audio file
try {
// Path to your audio file
const testAudioPath = path.join("test-audio.wav");

// Check if test file exists
if (!fs.existsSync(testAudioPath)) {
console.log(
"Test audio file not found. Please provide a valid audio file for transcription."
);
console.log("Expected path:", testAudioPath);
return;
}

const audioBuffer = fs.readFileSync(testAudioPath);

// Transcribe using OpenAI Whisper model
const result = await client.audio.transcriptions.create({
model: "openai:whisper-1",
file: audioBuffer,
language: "en",
response_format: "verbose_json",
temperature: 0,
timestamps: true,
});

console.log("Transcription Result:");
console.log("Text:", result.text);
console.log("Language:", result.language);
console.log("Confidence:", result.confidence);

if (result.words && result.words.length > 0) {
console.log("\nWords with timestamps:");
result.words.slice(0, 5).forEach((word, index) => {
console.log(
`${index + 1}. "${word.text}" (${word.start}s - ${word.end}s, confidence: ${word.confidence})`
);
});
}

if (result.segments && result.segments.length > 0) {
console.log("\nSegments:");
result.segments.slice(0, 3).forEach((segment, index) => {
console.log(
`${index + 1}. "${segment.text}" (${segment.start}s - ${segment.end}s)`
);
});
}
} catch (error) {
console.error("Error:", error);
}
}

main().catch(console.error);
3 changes: 3 additions & 0 deletions aisuite-js/package.json
@@ -13,12 +13,15 @@
"example:streaming": "tsx examples/streaming.ts",
"example:mistral": "tsx examples/mistral.ts",
"example:groq": "tsx examples/groq.ts",
"example:deepgram": "tsx examples/deepgram.ts",
"example:openai-asr": "tsx examples/openai-asr.ts",
"lint": "eslint src/**/*.ts",
"prepublishOnly": "npm run build",
"dev": "tsc --watch"
},
"dependencies": {
"@anthropic-ai/sdk": "^0.56.0",
"@deepgram/sdk": "^4.11.2",
"@mistralai/mistralai": "^0.1.3",
"groq-sdk": "^0.29.0",
"openai": "^4.0.0"
53 changes: 53 additions & 0 deletions aisuite-js/src/asr-providers/deepgram/adapters.ts
@@ -0,0 +1,53 @@
import { TranscriptionResult, Word, Segment } from "../../types";

export function adaptResponse(response: any): TranscriptionResult {
const words: Word[] = [];
const segments: Segment[] = [];

// Handle Deepgram response structure
if (response.results?.channels?.[0]?.alternatives?.[0]) {
const alternative = response.results.channels[0].alternatives[0];

// Extract words with timestamps and confidence
if (alternative.words) {
alternative.words.forEach((word: any) => {
words.push({
text: word.word,
start: word.start,
end: word.end,
confidence: word.confidence,
speaker: word.speaker?.toString(),
});
});
}

// Extract utterances/segments
if (response.results.utterances) {
response.results.utterances.forEach((utterance: any) => {
segments.push({
text: utterance.transcript,
start: utterance.start,
end: utterance.end,
speaker: utterance.speaker?.toString(),
});
});
}

return {
text: alternative.transcript,
language: response.metadata?.language || "unknown",
confidence: alternative.confidence,
words,
segments,
};
}

// Fallback for unexpected response structure
return {
text: response.transcript || "",
language: "unknown",
confidence: undefined,
words: [],
segments: [],
};
}
2 changes: 2 additions & 0 deletions aisuite-js/src/asr-providers/deepgram/index.ts
@@ -0,0 +1,2 @@
export { DeepgramASRProvider } from "./provider";
export type { DeepgramConfig } from "./types";