Add support for the .xlsx file type in the file source

vegito22 · vegito22 · commit ad01ca3279f9 · 2024-08-22T13:18:14.000-07:00
diff --git a/llmstack/common/utils/text_extract.py b/llmstack/common/utils/text_extract.py
@@ -14,6 +14,7 @@
 from unstructured.partition.pdf import partition_pdf
 from unstructured.partition.pptx import partition_pptx
 from unstructured.partition.text import partition_text
+from unstructured.partition.xlsx import partition_xlsx
 
 from llmstack.common.utils.audio_loader import (
     partition_audio,
@@ -105,6 +106,8 @@ def extract_text_elements(
                 metadata=ElementMetadata(filename=file_name),
             ),
         ]
+    elif mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
+        elements = partition_xlsx(file=data_fp, chunking_strategy=chunking_strategy)
     elif mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
         elements = partition_docx(file=data_fp, chunking_strategy=chunking_strategy)
     elif mime_type == "application/msword":
diff --git a/llmstack/data/destinations/stores/pandas.py b/llmstack/data/destinations/stores/pandas.py
@@ -115,7 +115,7 @@ def add(self, document):
             document_dict = {"text": node.text, **extra_data}
             entry_dict = {
                 "id": node.id_,
-                **{mapping.source: document_dict[mapping.target] for mapping in self.mapping},
+                **{mapping.source: document_dict.get(mapping.target) for mapping in self.mapping},
             }
             self._dataframe = self._dataframe._append(entry_dict, ignore_index=True)
 
diff --git a/llmstack/data/sources/files/file.py b/llmstack/data/sources/files/file.py
@@ -5,8 +5,7 @@
 
 from pydantic import Field
 
-from llmstack.common.blocks.data.source import DataSourceEnvironmentSchema
-from llmstack.common.blocks.data.source.uri import Uri, UriConfiguration, UriInput
+from llmstack.common.utils.text_extract import extract_text_elements
 from llmstack.common.utils.utils import validate_parse_data_uri
 from llmstack.data.sources.base import BaseSource, DataDocument
 from llmstack.data.sources.utils import (
@@ -28,14 +27,12 @@ class FileSchema(BaseSource):
             "accepts": {
                 "application/pdf": [],
                 "application/json": [],
-                "audio/mpeg": [],
                 "application/rtf": [],
                 "text/plain": [],
                 "application/vnd.openxmlformats-officedocument.wordprocessingml.document": [],
                 "application/vnd.openxmlformats-officedocument.presentationml.presentation": [],
-                "audio/mp3": [],
-                "video/mp4": [],
-                "video/webm": [],
+                "text/csv": [],
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": [],
             },
         },
     )
@@ -78,11 +75,13 @@ def get_data_documents(self, **kwargs) -> List[DataDocument]:
     @classmethod
     def process_document(cls, document: DataDocument) -> DataDocument:
         data_uri = get_source_document_asset_by_objref(document.content)
-        result = Uri().process(
-            input=UriInput(env=DataSourceEnvironmentSchema(), uri=data_uri),
-            configuration=UriConfiguration(),
+        mime_type, file_name, file_data = validate_parse_data_uri(data_uri)
+        decoded_file_data = base64.b64decode(file_data)
+        elements = extract_text_elements(
+            mime_type=mime_type, data=decoded_file_data, file_name=file_name, extra_params=None
         )
-        text_content = result.documents[0].content_text
+        text_content = "".join([element.text for element in elements])
+
         text_data_uri = (
             f"data:text/plain;name={document.id_}_text.txt;base64,{base64.b64encode(text_content.encode()).decode()}"
         )
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -96,6 +96,7 @@ beautifulsoup4 = {version = "^4.12.2" }
 sqlalchemy = "^2.0.31"
 websockify = "^0.12.0"
 playwright = "1.45.0"
+openpyxl = "^3.1.5"
 
 [tool.poetry.group.data]
 

Original file line number	Diff line number	Diff line change
`@@ -115,7 +115,7 @@ def add(self, document):`
`115`	`115`	`document_dict = {"text": node.text, **extra_data}`
`116`	`116`	`entry_dict = {`
`117`	`117`	`"id": node.id_,`
`118`		`- **{mapping.source: document_dict[mapping.target] for mapping in self.mapping},`
	`118`	`+ **{mapping.source: document_dict.get(mapping.target) for mapping in self.mapping},`
`119`	`119`	`}`
`120`	`120`	`self._dataframe = self._dataframe._append(entry_dict, ignore_index=True)`
`121`	`121`