Skip to content

Commit 46ee595

Browse files
committed
fix emoji trimming in embeddings processing
1 parent 089e97e commit 46ee595

File tree

1 file changed

+12
-1
lines changed

1 file changed

+12
-1
lines changed

src/tools/embeddings.ts

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,14 @@ async function getBatchEmbeddingsWithRetry(
8282
const batchEmbeddings: number[][] = [];
8383
let batchTokens = 0;
8484
let retryCount = 0;
85-
let textsToProcess = [...batchTexts]; // Copy the original texts
85+
let textsToProcess = [...batchTexts].map(item => {
86+
if (typeof item === 'string') {
87+
return trimLeadingSymbols(item);
88+
} else {
89+
const key = Object.keys(item)[0];
90+
return key === 'text' ? { text: trimLeadingSymbols(item[key]) } : item;
91+
}
92+
}); // Copy the original texts
8693
let indexMap = new Map<number, number>(); // Map to keep track of original indices
8794

8895
// Initialize indexMap with original indices
@@ -241,4 +248,8 @@ function truncateInputString(input: string | Record<string, string>): string {
241248
} else {
242249
return Object.values(input)[0].slice(0, 50);
243250
}
251+
}
252+
253+
/**
 * Removes any run of emoji, pictographic symbols, emoji joiners/selectors and
 * whitespace from the start of `str`. Text after the first ordinary character
 * (including emoji that appear later in the string) is left untouched.
 *
 * Characters stripped from the leading run:
 * - U+1F000–U+1FAFF: the supplementary-plane emoji and pictograph blocks,
 *   including the "Extended-A" pictographs above U+1F9FF.
 * - U+2600–U+27BF: Miscellaneous Symbols and Dingbats.
 * - U+FE00–U+FE0F: variation selectors (e.g. the emoji presentation selector).
 * - U+200D: zero-width joiner, so multi-part sequences such as family emoji
 *   are removed as a whole instead of leaving a dangling joiner behind.
 * - U+DFE5: a lone low surrogate, kept from the original pattern.
 * - Any whitespace matched by `\s`.
 *
 * @param str - The text to clean.
 * @returns `str` without its leading symbol/whitespace run; the empty string
 *   if `str` consists only of such characters.
 */
function trimLeadingSymbols(str: string): string {
  // The pattern is anchored with `^`, so at most one match exists and the
  // global flag is unnecessary; the `u` flag is required for `\u{...}` escapes
  // and for matching astral code points as single characters.
  return str.replace(
    /^(?:[\u{1F000}-\u{1FAFF}]|[\u{2600}-\u{27BF}]|[\u{FE00}-\u{FE0F}]|\u{200D}|[\u{DFE5}]|\s)+/u,
    '',
  );
}

0 commit comments

Comments
 (0)