Skip to content

Commit 83717af

Browse files
committed
Refactored the extractor segment and renamed it readFile. refactored field_segment to include an AbstractFieldSegment class.
1 parent a5f0515 commit 83717af

File tree

8 files changed

+88
-51
lines changed

8 files changed

+88
-51
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818
- Pulls "Annotated" typing from parameter names to create the Parameters section of the documentation.
1919
Makes for cleaner, more consistently up to date documentation. The use of Annotated is optional.
2020
- Updated **isIn** and **isNotIn** to function like **isTrue** so that they need not always be filters.
21+
- Created an AbstractFieldSegment class and changed the field_segment decorator to use it. This makes it easier
22+
to create segments with consistent "field segment" behavior that require additional initialization.
23+
- Renamed ExtractFile to ReadFile and refactored it to be descended from AbstractFieldSegment
24+
- Renamed **extract** to **readFile** in chatterlang for consistency.
2125

2226
## 0.8.1
2327
### Improvements

src/talkpipe/data/extraction.py

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
from typing import Union, Iterable, Annotated
44
from pathlib import PosixPath
55
from docx import Document
6-
from talkpipe.pipe.core import segment, AbstractSegment, field_segment
7-
from talkpipe.chatterlang.registry import register_segment, register_source
6+
from talkpipe.pipe.core import segment, AbstractFieldSegment, field_segment
7+
from talkpipe.chatterlang.registry import register_segment
88
import logging
99
from pathlib import Path
1010
import glob
@@ -107,8 +107,8 @@ def listFiles(patterns: Annotated[Iterable[str], "Iterable of file patterns or p
107107
else:
108108
logging.debug(f"Skipping non-file: {match}")
109109

110-
@register_segment("extract")
111-
class FileExtractor(AbstractSegment):
110+
@register_segment("readFile")
111+
class ReadFile(AbstractFieldSegment):
112112
"""
113113
A class for extracting text content from different file types.
114114
@@ -121,22 +121,13 @@ class FileExtractor(AbstractSegment):
121121
122122
Methods:
123123
register_extractor(file_extension: str, extractor): Register a new file extractor for a specific extension.
124-
extract(file_path: Union[str, PosixPath]): Extract content from a single file.
125-
transform(input_iter): Transform an iterator of file paths into an iterator of their contents.
124+
ProcessItem(file_path: Union[str, PosixPath]): Extract content from a single file.
126125
127-
Example:
128-
>>> extractor = FileExtractor()
129-
>>> content = extractor.extract("document.txt")
130-
>>> for text in extractor.transform(["file1.txt", "file2.docx"]):
131-
... print(text)
132-
133-
Raises:
134-
Exception: When trying to extract content from a file with an unsupported extension.
135126
"""
136127
_extractors:dict
137128

138-
def __init__(self):
139-
super().__init__()
129+
def __init__(self, field: str = None, set_as: str = None):
130+
super().__init__(field=field, set_as=set_as)
140131
logging.debug("Initializing FileExtractor")
141132
self._extractors = {}
142133
self.register_extractor("txt", readtxt())
@@ -147,14 +138,11 @@ def register_extractor(self, file_extension:str, extractor):
147138
logging.debug(f"Registering extractor for extension: {file_extension}")
148139
self._extractors[file_extension] = extractor
149140

150-
def extract(self, file_path:Union[str, PosixPath]):
141+
def process_value(self, file_path:Union[str, PosixPath]):
151142
file_extension = file_path.split(".")[-1] if isinstance(file_path, str) else file_path.suffix[1:]
152143
if file_extension not in self._extractors:
153144
logging.error(f"Unsupported file extension: {file_extension}")
154145
raise Exception(f"File extension {file_extension} not supported")
155146
logging.debug(f"Extracting content from file: {file_path}")
156147
return next(self._extractors[file_extension]([file_path]))
157148

158-
def transform(self, input_iter):
159-
for file_path in input_iter:
160-
yield self.extract(file_path)

src/talkpipe/pipe/core.py

Lines changed: 56 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -398,27 +398,18 @@ def field_segment(*decorator_args, **decorator_kwargs):
398398
set_as: The field name to append the result as
399399
"""
400400
def decorator(func):
401-
class FieldSegment(AbstractSegment):
401+
class FieldSegment(AbstractFieldSegment):
402402
def __init__(self, *init_args, **init_kwargs):
403-
super().__init__()
404403
merged_kwargs = {**decorator_kwargs, **init_kwargs}
405-
self.field = merged_kwargs.get('field')
406-
self.set_as = merged_kwargs.get('set_as')
407-
merged_kwargs.pop('field', None)
408-
merged_kwargs.pop('set_as', None)
404+
field = merged_kwargs.pop('field', None)
405+
set_as = merged_kwargs.pop('set_as', None)
406+
super().__init__(field=field, set_as=set_as)
409407
self._func = lambda x: func(x, *init_args, **merged_kwargs)
410408
# Store reference to original function for documentation access
411409
self._original_func = func
412410

413-
def transform(self, input_iter):
414-
for item in input_iter:
415-
value = data_manipulation.extract_property(item, self.field) if self.field else item
416-
result = self._func(value)
417-
if self.set_as:
418-
item[self.set_as] = result
419-
yield item
420-
else:
421-
yield result
411+
def process_value(self, value):
412+
return self._func(value)
422413

423414
FieldSegment.__name__ = f"{func.__name__}FieldSegment"
424415
# Preserve original function's docstring and metadata
@@ -430,6 +421,56 @@ def transform(self, input_iter):
430421
return decorator(decorator_args[0])
431422
return decorator
432423

424+
class AbstractFieldSegment(AbstractSegment[T, U]):
425+
"""Abstract base class for segments that process a single field and optionally set results.
426+
427+
This class handles the 'field' and 'set_as' parameters that are commonly used
428+
in field-processing segments, making it easy for descendant classes to have
429+
their own constructors while still supporting field extraction and result setting.
430+
431+
Args:
432+
field: The field to extract from each item (optional)
433+
set_as: The field name to set/append the result as (optional)
434+
"""
435+
436+
def __init__(self, field: str = None, set_as: str = None):
437+
super().__init__()
438+
self.field = field
439+
self.set_as = set_as
440+
441+
@abstractmethod
442+
def process_value(self, value: Any) -> Any:
443+
"""Process the extracted field value or the entire item.
444+
445+
This method must be implemented by subclasses to define how to process
446+
the extracted field value (or entire item if no field is specified).
447+
448+
Args:
449+
value: The field value extracted from the item, or the entire item
450+
if no field was specified
451+
452+
Returns:
453+
Any: The processed result
454+
"""
455+
pass
456+
457+
def transform(self, input_iter: Iterable[T]) -> Iterator[U]:
458+
"""Transform input items by processing field values.
459+
460+
For each item:
461+
1. Extract the specified field value (or use entire item if no field)
462+
2. Process the value using process_value()
463+
3. Either yield the result directly or set it on the item and yield the item
464+
"""
465+
for item in input_iter:
466+
value = data_manipulation.extract_property(item, self.field) if self.field else item
467+
result = self.process_value(value)
468+
if self.set_as:
469+
item[self.set_as] = result
470+
yield item
471+
else:
472+
yield result
473+
433474
class Pipeline(AbstractSegment):
434475
"""A pipeline is a sequence of operations. Each operation draws from the output of the previous operation
435476
and yields items to the next operation. The pipeline can be executed by calling it with an input iterator.

src/talkpipe/search/abstract.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,7 @@
1-
from typing import List, Dict, Any, Optional, Tuple, Union, Protocol
1+
from typing import List, Optional, Tuple, Union, Protocol
22
from pydantic import BaseModel
3-
import numpy as np
43

5-
# Type aliases
6-
VectorLike = Union[List[float], np.ndarray]
7-
Document = Dict[str, str]
8-
DocID = str
4+
from talkpipe.util.data_manipulation import VectorLike, Document, DocID
95

106
class SearchResult(BaseModel):
117
score: float

src/talkpipe/search/simplevectordb.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414
from talkpipe.pipe.core import segment
1515
from talkpipe.pipe import field_segment
1616
from talkpipe.chatterlang import register_segment
17-
from .abstract import VectorLike, DocumentStore, VectorAddable, VectorSearchable, SearchResult, Document, DocID
18-
from talkpipe.util.data_manipulation import extract_property, toDict
17+
from .abstract import DocumentStore, VectorAddable, VectorSearchable, SearchResult
18+
from talkpipe.util.data_manipulation import DocID, Document, VectorLike, extract_property, toDict
1919

2020
logger = logging.getLogger(__name__)
2121

src/talkpipe/search/whoosh.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,15 @@
1010
from whoosh.writing import LockError
1111
from talkpipe.pipe import segment, field_segment
1212
from talkpipe.chatterlang import register_segment
13-
from talkpipe.util.data_manipulation import toDict, extract_property
13+
from talkpipe.util.data_manipulation import DocID, Document, toDict, extract_property
1414
from talkpipe.util.config import parse_key_value_str
1515
import time
1616

1717
from .abstract import (
1818
SearchResult,
1919
DocumentStore,
2020
MutableDocumentStore,
21-
TextSearchable,
22-
Document,
23-
DocID
21+
TextSearchable
2422
)
2523

2624
logger = logging.getLogger(__name__)

src/talkpipe/util/data_manipulation.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,20 @@
1+
from typing import Union
12
import logging
23
import re
34
import inspect
45
import json
56
import textwrap
67
from types import MappingProxyType
7-
from typing import Any, Set
8+
from typing import Any, Dict, List, Set
9+
10+
import numpy as np
811
from talkpipe.util.config import parse_key_value_str
912

13+
# Type aliases
14+
VectorLike = Union[List[float], np.ndarray]
15+
Document = Dict[str, str]
16+
DocID = str
17+
1018
logger = logging.getLogger(__name__)
1119

1220
def get_all_attributes(obj: Any, skip_packages: tuple = ('pydantic',), visited: Set = None,
@@ -359,4 +367,6 @@ def lambda_function(item: Any) -> Any:
359367
raise
360368
return None
361369

362-
return lambda_function
370+
return lambda_function
371+
372+

tests/talkpipe/data/test_extraction.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import pytest
2-
from talkpipe.data.extraction import FileExtractor, readtxt, readdocx, listFiles
2+
from talkpipe.data.extraction import ReadFile, readtxt, readdocx, listFiles
33

44
def test_readdocx(tmp_path):
55
# Test reading individual docx file using existing test file
@@ -53,7 +53,7 @@ def test_readtxt(tmp_path):
5353
next(readtxt()([tmp_path / "nonexistent_dir"]))
5454

5555
def test_FileExtractor(tmp_path):
56-
fe = FileExtractor()
56+
fe = ReadFile()
5757

5858
with open(tmp_path / "test.txt", "w") as file:
5959
file.write("Hello World")

0 commit comments

Comments
 (0)