Skip to content

Commit ad01ca3

Browse files
committed
Add support for .xlsx filetype in file source
1 parent af5c7fc commit ad01ca3

File tree

5 files changed

+40
-12
lines changed

5 files changed

+40
-12
lines changed

llmstack/common/utils/text_extract.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from unstructured.partition.pdf import partition_pdf
1515
from unstructured.partition.pptx import partition_pptx
1616
from unstructured.partition.text import partition_text
17+
from unstructured.partition.xlsx import partition_xlsx
1718

1819
from llmstack.common.utils.audio_loader import (
1920
partition_audio,
@@ -105,6 +106,8 @@ def extract_text_elements(
105106
metadata=ElementMetadata(filename=file_name),
106107
),
107108
]
109+
elif mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
110+
elements = partition_xlsx(file=data_fp, chunking_strategy=chunking_strategy)
108111
elif mime_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
109112
elements = partition_docx(file=data_fp, chunking_strategy=chunking_strategy)
110113
elif mime_type == "application/msword":

llmstack/data/destinations/stores/pandas.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def add(self, document):
115115
document_dict = {"text": node.text, **extra_data}
116116
entry_dict = {
117117
"id": node.id_,
118-
**{mapping.source: document_dict[mapping.target] for mapping in self.mapping},
118+
**{mapping.source: document_dict.get(mapping.target) for mapping in self.mapping},
119119
}
120120
self._dataframe = self._dataframe._append(entry_dict, ignore_index=True)
121121

llmstack/data/sources/files/file.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55

66
from pydantic import Field
77

8-
from llmstack.common.blocks.data.source import DataSourceEnvironmentSchema
9-
from llmstack.common.blocks.data.source.uri import Uri, UriConfiguration, UriInput
8+
from llmstack.common.utils.text_extract import extract_text_elements
109
from llmstack.common.utils.utils import validate_parse_data_uri
1110
from llmstack.data.sources.base import BaseSource, DataDocument
1211
from llmstack.data.sources.utils import (
@@ -28,14 +27,12 @@ class FileSchema(BaseSource):
2827
"accepts": {
2928
"application/pdf": [],
3029
"application/json": [],
31-
"audio/mpeg": [],
3230
"application/rtf": [],
3331
"text/plain": [],
3432
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": [],
3533
"application/vnd.openxmlformats-officedocument.presentationml.presentation": [],
36-
"audio/mp3": [],
37-
"video/mp4": [],
38-
"video/webm": [],
34+
"text/csv": [],
35+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": [],
3936
},
4037
},
4138
)
@@ -78,11 +75,13 @@ def get_data_documents(self, **kwargs) -> List[DataDocument]:
7875
@classmethod
7976
def process_document(cls, document: DataDocument) -> DataDocument:
8077
data_uri = get_source_document_asset_by_objref(document.content)
81-
result = Uri().process(
82-
input=UriInput(env=DataSourceEnvironmentSchema(), uri=data_uri),
83-
configuration=UriConfiguration(),
78+
mime_type, file_name, file_data = validate_parse_data_uri(data_uri)
79+
decoded_file_data = base64.b64decode(file_data)
80+
elements = extract_text_elements(
81+
mime_type=mime_type, data=decoded_file_data, file_name=file_name, extra_params=None
8482
)
85-
text_content = result.documents[0].content_text
83+
text_content = "".join([element.text for element in elements])
84+
8685
text_data_uri = (
8786
f"data:text/plain;name={document.id_}_text.txt;base64,{base64.b64encode(text_content.encode()).decode()}"
8887
)

poetry.lock

Lines changed: 26 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ beautifulsoup4 = {version = "^4.12.2" }
9696
sqlalchemy = "^2.0.31"
9797
websockify = "^0.12.0"
9898
playwright = "1.45.0"
99+
openpyxl = "^3.1.5"
99100

100101
[tool.poetry.group.data]
101102

0 commit comments

Comments
 (0)