A Swift package for parsing and extracting content from Apple iWork documents (Pages, Numbers, and Keynote). WorkKit provides a straightforward API to open iWork documents and traverse their content.
Add WorkKit to your project using Swift Package Manager:
dependencies: [
.package(url: "https://github.com/6over3/WorkKit.git", from: "1.0.0")
]
import WorkKit
// Open the document at the given path; the call is marked `try`, so it must run in a throwing context.
let document = try IWorkParser.open(at: "/path/to/document.pages")
// Print the `type` and `format` properties of the opened document.
print("Document type: \(document.type)")
print("Format: \(document.format)")
Implement the IWorkDocumentVisitor protocol to process document content:
/// Example visitor that prints the text content it encounters while a document is traversed.
struct TextExtractor: IWorkDocumentVisitor {
    /// Creates the extractor.
    /// - Parameters:
    ///   - document: The document to traverse.
    ///   - ocrProvider: Optional OCR provider; pass `nil` to go without one.
    init(using document: IWorkDocument, with ocrProvider: OCRProvider?) {
        // Initialize with document
    }

    /// Entry point that starts the traversal (left as a placeholder in this example).
    func accept() async throws {
        // Traverse document
    }

    /// Prints a line for each inline element kind this example handles.
    func visitInlineElement(_ element: InlineElement) async {
        switch element {
        // Associated values, in order: text, style, hyperlink. Only the text is used,
        // so the rest are matched with `_` to avoid unused-binding warnings.
        case .text(let text, _, _):
            print(text)
        // Associated values, in order: image info, spatial info, OCR result, hyperlink.
        case .image(let info, _, _, _):
            print("Image: \(info.filename ?? "unknown")")
        case .footnoteMarker(let footnote):
            print("Footnote #\(footnote.number)")
        default:
            // Every other inline element kind is ignored by this example.
            break
        }
    }

    /// Prints a header line with the table's name (or "untitled") and its dimensions.
    func willVisitTable(name: String?, rowCount: UInt32, columnCount: UInt32, spatialInfo: SpatialInfo) async {
        print("Table: \(name ?? "untitled") (\(rowCount)×\(columnCount))")
    }

    /// Prints the cell's coordinates followed by its value for text and number cells.
    func visitTableCell(row: Int, column: Int, content: TableCellContent) async {
        switch content {
        case .text(let text, _):
            print(" [\(row),\(column)]: \(text)")
        case .number(let value, _):
            print(" [\(row),\(column)]: \(value)")
        default:
            // Other cell content kinds are skipped.
            break
        }
    }
}
// Create the visitor without an OCR provider (`nil`), then start the traversal.
// `accept()` is declared `async throws`, hence `try await`.
let visitor = TextExtractor(using: document, with: nil)
try await visitor.accept()
// `preview(_:)` is unwrapped with `if let`, so each block runs only when that preview exists.
if let thumbnail = document.preview(.thumbnail) {
// NOTE: `UIImage` is a UIKit type, so this line needs `import UIKit`.
let image = UIImage(data: thumbnail)
}
// Same pattern for the `.standard` preview.
if let standard = document.preview(.standard) {
let image = UIImage(data: standard)
}
// Get all available previews
let previews = document.allPreviews()
// Only the name is printed, so the value paired with it is matched with `_`
// rather than bound to an unused constant.
for (name, _) in previews {
    print("Preview: \(name)")
}
// `properties` is optional, so the UUID and format version are printed only when it is present.
if let properties = document.metadata.properties {
// Both fields fall back to "unknown" when they are nil.
print("Document UUID: \(properties.documentUUID ?? "unknown")")
print("File format version: \(properties.fileFormatVersion ?? "unknown")")
}
// The build version history is read directly from `metadata`, outside the optional `properties`.
print("Build history: \(document.metadata.buildVersionHistory)")
Provide an OCR provider to extract text from images:
/// Example `OCRProvider` conformance; the recognition logic is left for the adopter to fill in.
struct MyOCRProvider: OCRProvider {
/// Recognizes text in the supplied image data.
/// - Parameters:
///   - imageData: The image bytes to analyze.
///   - info: `ImageInfo` for the image (presumably the same value surfaced by `InlineElement.image` — confirm).
/// - Returns: The `OCRResult` for the image.
/// - Throws: Any error raised by the recognition step.
func recognizeText(in imageData: Data, info: ImageInfo) async throws -> OCRResult {
// Implement text recognition
// NOTE: this placeholder must return an `OCRResult` (or throw) before the snippet will compile.
}
}
// Pass the provider to the visitor so text can be extracted from images.
// `TextExtractor` is the visitor defined earlier in this document.
let ocrProvider = MyOCRProvider()
let visitor = TextExtractor(using: document, with: ocrProvider)
try await visitor.accept()
GNU Affero General Public License
Contributions are welcome. Please open an issue or submit a pull request.