1
- import { MDocument } from "@mastra/rag"
2
- import { vectorStore , VectorStoreMetadata } from "../src/lib/vector-store" ;
1
+ import { MDocument } from "@mastra/rag" ;
2
+ import { vectorStore , VectorStoreMetadata } from "../src/lib/vector-store" ;
3
3
import { getContent } from "../src/lib/utils/content" ;
4
4
import { embedMany } from "ai" ;
5
5
import { openai } from "@ai-sdk/openai" ;
6
+ import { librariesWithFeatures } from "../src/lib/utils/process-libraries" ;
6
7
7
/** Batch size used when the BATCH_SIZE environment variable is unset or unusable. */
const DEFAULT_BATCH_SIZE = 50;

/**
 * Converts the raw BATCH_SIZE environment value into a usable batch size.
 *
 * The value is parsed as a base-10 integer. Anything that does not yield a
 * positive integer (unset, empty, non-numeric, zero, negative) resolves to
 * `fallback`: a NaN batch size would make the batching loop process nothing,
 * and a zero batch size would make it loop forever.
 *
 * @param raw - The raw environment value, or `undefined` when it is not set.
 * @param fallback - Value returned when `raw` is missing or invalid.
 * @returns A positive integer batch size.
 */
function parseBatchSize(
  raw: string | undefined,
  fallback: number = DEFAULT_BATCH_SIZE
): number {
  if (raw === undefined || raw === "") {
    return fallback;
  }
  const parsed = Number.parseInt(raw, 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

/** Number of pages handled per batch; configurable through the BATCH_SIZE env var. */
const BATCH_SIZE = parseBatchSize(process.env.BATCH_SIZE);
8
11
9
12
async function initVectorStore ( ) {
10
13
// Delete index content
@@ -16,9 +19,8 @@ async function initVectorStore() {
16
19
console . log ( "Index contents deleted" ) ;
17
20
} catch ( error ) { }
18
21
19
-
20
22
// Wait 3 seconds
21
- await new Promise ( resolve => setTimeout ( resolve , 3000 ) ) ;
23
+ await new Promise ( ( resolve ) => setTimeout ( resolve , 3000 ) ) ;
22
24
23
25
// Create index
24
26
console . log ( "Creating index..." ) ;
@@ -29,6 +31,12 @@ async function initVectorStore() {
29
31
30
32
console . log ( "Index created" ) ;
31
33
34
+ await addDocsToVectorStore ( ) ;
35
+ console . log ( "Done" ) ;
36
+ }
37
+
38
+
39
+ async function addDocsToVectorStore ( ) {
32
40
// Add documents
33
41
const docsPages = getContent ( ) ;
34
42
@@ -37,32 +45,63 @@ async function initVectorStore() {
37
45
// Process in batches
38
46
for ( let i = 0 ; i < docsPages . length ; i += BATCH_SIZE ) {
39
47
const batch = docsPages . slice ( i , i + BATCH_SIZE ) ;
40
- console . log ( `Processing batch ${ Math . floor ( i / BATCH_SIZE ) + 1 } of ${ Math . ceil ( docsPages . length / BATCH_SIZE ) } ` ) ;
41
- await Promise . all ( batch . map ( async page => {
42
- try {
43
- console . log ( `Processing page: ${ page . webPath } ` ) ;
44
- await processPage ( page ) ;
45
- } catch ( error ) {
46
- console . error ( `Error processing document ${ page . filePath } :` , error ) ;
47
- }
48
- } ) ) ;
48
+ console . log (
49
+ `Processing batch ${ Math . floor ( i / BATCH_SIZE ) + 1 } of ${ Math . ceil ( docsPages . length / BATCH_SIZE ) } `
50
+ ) ;
51
+ await Promise . all (
52
+ batch . map ( async ( page ) => {
53
+ try {
54
+ console . log ( `Processing page: ${ page . webPath } ` ) ;
55
+ await processPage ( {
56
+ markdown : page . body ,
57
+ webPath : page . webPath ,
58
+ layout : page . attributes . layout ,
59
+ title : page . attributes . title ,
60
+ description : page . attributes . description ,
61
+ } ) ;
62
+ } catch ( error ) {
63
+ console . error ( `Error processing document ${ page . filePath } :` , error ) ;
64
+ }
65
+ } )
66
+ ) ;
49
67
}
50
-
51
- console . log ( "Done" ) ;
52
68
}
53
69
54
/**
 * Indexes a single page: splits its markdown into chunks, embeds the chunks,
 * and upserts the embeddings together with the page's metadata.
 *
 * @param page - The page's markdown plus the metadata stored alongside each
 *   chunk (`webPath`, `layout`, `title`, `description`, optional `library`).
 * @returns The chunks produced from the markdown and their embeddings.
 */
export async function processPage(page: {
  markdown: string;
  webPath: string;
  layout: string;
  title: string;
  description: string;
  library?: string;
}) {
  const { markdown, webPath, layout, title, description, library } = page;

  const chunks = await getChunks({ markdown });
  const embeddings = await embedDocsPage(chunks);

  // The full markdown is forwarded as `content` alongside the per-chunk data.
  const upsertArgs = {
    webPath,
    layout,
    title,
    description,
    embeddings,
    chunks,
    library,
    content: markdown,
  };
  await upsertDocsPageEmbeddings(upsertArgs);

  return { chunks, embeddings };
}
60
99
61
- export async function getChunks ( page : ReturnType < typeof getContent > [ 0 ] ) {
62
- const doc = MDocument . fromMarkdown ( page . body ) ;
100
+ export async function getChunks ( { markdown } : { markdown : string } ) {
101
+ const doc = MDocument . fromMarkdown ( markdown ) ;
63
102
const chunks = await doc . chunk ( {
64
103
strategy : "markdown" ,
65
- extract : { }
104
+ extract : { } ,
66
105
} ) ;
67
106
68
107
return chunks ;
@@ -71,30 +110,48 @@ export async function getChunks(page: ReturnType<typeof getContent>[0]) {
71
110
/**
 * Embeds the text of every chunk with the OpenAI "text-embedding-3-small"
 * embedding model via `embedMany` (called with `maxRetries: 3`).
 *
 * @param chunks - Chunks as returned by `MDocument.chunk`.
 * @returns The `embeddings` array from the `embedMany` result.
 */
async function embedDocsPage(chunks: Awaited<ReturnType<MDocument["chunk"]>>) {
  const model = openai.embedding("text-embedding-3-small");
  const texts = chunks.map(({ text }) => text);

  const { embeddings } = await embedMany({
    model,
    values: texts,
    maxRetries: 3,
  });
  return embeddings;
}
79
118
80
/**
 * Upserts one page's chunk embeddings into the "docs" index.
 *
 * Each chunk gets a metadata record holding its text, an id of the form
 * `<webPath>_c_<chunk index>`, the page-level fields, the full page content,
 * and a `createdAt` ISO timestamp taken when the record is built.
 *
 * @param args - Page metadata, the full page `content`, the `chunks`, and the
 *   `embeddings` passed as the vectors for the upsert.
 */
async function upsertDocsPageEmbeddings(args: {
  webPath: string;
  layout: string;
  title: string;
  description: string;
  library?: string;
  content: string;
  chunks: Awaited<ReturnType<MDocument["chunk"]>>;
  embeddings: Awaited<ReturnType<typeof embedMany>>["embeddings"];
}) {
  const {
    webPath,
    layout,
    title,
    description,
    library,
    content,
    chunks,
    embeddings,
  } = args;

  const metadata = chunks.map((chunk, index) => {
    const record = {
      text: chunk.text,
      id: `${webPath}_c_${index}`,
      layout,
      title,
      description,
      createdAt: new Date().toISOString(),
      webPath,
      library,
      content,
    } satisfies VectorStoreMetadata;
    return record;
  });

  await vectorStore.upsert({
    indexName: "docs",
    vectors: embeddings,
    metadata,
  });
}
100
157
0 commit comments