diff --git a/README.md b/README.md index ea986ff7..9f011415 100644 --- a/README.md +++ b/README.md @@ -285,6 +285,12 @@ Converts the source document to HTML. * `ignore_empty_paragraphs`: by default, empty paragraphs are ignored. Set this option to `False` to preserve empty paragraphs in the output. +* `include_headers_and_footers`: by default, headers and footers are not included in the output. + Set this option to `True` to include them at the start and end of the output. + +* `deduplicate_headers_and_footers`: by default, when headers and footers are included, repeated ones are all kept. + Set this option to `True` to include each distinct header or footer only once (only has an effect when `include_headers_and_footers` is `True`). + * `id_prefix`: a string to prepend to any generated IDs, such as those used by bookmarks, footnotes and endnotes. diff --git a/mammoth/conversion.py b/mammoth/conversion.py index eb514ec1..a320a32f 100644 --- a/mammoth/conversion.py +++ b/mammoth/conversion.py @@ -16,7 +16,9 @@ def convert_document_element_to_html(element, convert_image=None, id_prefix=None, output_format=None, - ignore_empty_paragraphs=True): + ignore_empty_paragraphs=True, + include_headers_and_footers=False, + deduplicate_headers_and_footers=False): if style_map is None: style_map = [] @@ -42,6 +44,8 @@ def convert_document_element_to_html(element, convert_image=convert_image, id_prefix=id_prefix, ignore_empty_paragraphs=ignore_empty_paragraphs, + include_headers_and_footers=include_headers_and_footers, + deduplicate_headers_and_footers=deduplicate_headers_and_footers, note_references=[], comments=comments, ) @@ -62,11 +66,22 @@ def copy(self, **kwargs): class _DocumentConverter(documents.element_visitor(args=1)): - def __init__(self, messages, style_map, convert_image, id_prefix, ignore_empty_paragraphs, note_references, comments): + def __init__(self, + messages, + style_map, + convert_image, + id_prefix, + ignore_empty_paragraphs, + include_headers_and_footers, + deduplicate_headers_and_footers, + note_references, + comments): self._messages = messages
self._style_map = style_map self._id_prefix = id_prefix self._ignore_empty_paragraphs = ignore_empty_paragraphs + self._include_headers_and_footers = include_headers_and_footers + self._deduplicate_headers_and_footers = deduplicate_headers_and_footers self._note_references = note_references self._referenced_comments = [] self._convert_image = convert_image @@ -80,18 +95,13 @@ def visit_image(self, image, context): return [] def visit_document(self, document, context): + headers = self.visit_headers(document.headers, context) nodes = self._visit_all(document.children, context) - notes = [ - document.notes.resolve(reference) - for reference in self._note_references - ] - notes_list = html.element("ol", {}, self._visit_all(notes, context)) - comments = html.element("dl", {}, [ - html_node - for referenced_comment in self._referenced_comments - for html_node in self.visit_comment(referenced_comment, context) - ]) - return nodes + [notes_list, comments] + notes_list = self.visit_notes(document.notes, context) + comments = self.visit_comments(context) + footers = self.visit_footers(document.footers, context) + + return headers + nodes + [notes_list, comments] + footers def visit_paragraph(self, paragraph, context): @@ -257,6 +267,12 @@ def visit_note(self, note, context): html.element("li", {"id": self._note_html_id(note)}, note_body) ] + def visit_notes(self, notes, context): + resolved_notes = [ + notes.resolve(reference) + for reference in self._note_references + ] + return html.element("ol", {}, self._visit_all(resolved_notes, context)) def visit_comment_reference(self, reference, context): def nodes(): @@ -300,6 +316,74 @@ def visit_comment(self, referenced_comment, context): html.element("dd", {}, body), ] + def visit_comments(self, context): + return html.element("dl", {}, [ + html_node + for referenced_comment in self._referenced_comments + for html_node in self.visit_comment(referenced_comment, context) + ]) + + def visit_header(self, header, context): + return 
self._visit_all(header.children, context) + + def visit_headers(self, headers, context): + if not self._include_headers_and_footers: + return [] + + all_headers = [ + html_node + for h in headers + for html_node in self.visit_header(h, context) + ] + + if not self._deduplicate_headers_and_footers: + return [ + html.element("header", {}, [h]) + for h in all_headers + ] + + header_values = set() + filtered_headers = [] + for h in all_headers: + if h.to_text() not in header_values: + filtered_headers.append(h) + header_values.add(h.to_text()) + + return [ + html.element("header", {}, [f]) + for f in filtered_headers + ] + + def visit_footer(self, footer, context): + return self._visit_all(footer.children, context) + + def visit_footers(self, footers, context): + if not self._include_headers_and_footers: + return [] + + all_footers = [ + html_node + for f in footers + for html_node in self.visit_footer(f, context) + ] + + if not self._deduplicate_headers_and_footers: + return [ + html.element("footer", {}, [f]) + for f in all_footers + ] + + footer_values = set() + filtered_footers = [] + for h in all_footers: + if h.to_text() not in footer_values: + filtered_footers.append(h) + footer_values.add(h.to_text()) + + return [ + html.element("footer", {}, [f]) + for f in filtered_footers + ] def _visit_all(self, elements, context): return [ diff --git a/mammoth/documents.py b/mammoth/documents.py index b00f1bbf..11f938e4 100644 --- a/mammoth/documents.py +++ b/mammoth/documents.py @@ -14,6 +14,8 @@ class HasChildren(Element): class Document(HasChildren): notes = cobble.field() comments = cobble.field() + headers = cobble.field() + footers = cobble.field() @cobble.data class Paragraph(HasChildren): @@ -97,12 +99,16 @@ class Image(Element): open = cobble.field() -def document(children, notes=None, comments=None): +def document(children, notes=None, comments=None, headers=None, footers=None): if notes is None: notes = Notes({}) if comments is None: comments = [] - return
Document(children, notes, comments=comments) + if headers is None: + headers = [] + if footers is None: + footers = [] + return Document(children, notes, comments=comments, headers=headers, footers=footers) def paragraph(children, style_id=None, style_name=None, numbering=None, alignment=None, indent=None): if indent is None: @@ -252,5 +258,17 @@ class CommentReference(Element): comment_reference = CommentReference +@cobble.data +class Header(HasChildren): + pass + +header = Header + +@cobble.data +class Footer(HasChildren): + pass + +footer = Footer + def element_visitor(args): return cobble.visitor(Element, args=args) diff --git a/mammoth/docx/__init__.py b/mammoth/docx/__init__.py index 75511d24..9b931204 100644 --- a/mammoth/docx/__init__.py +++ b/mammoth/docx/__init__.py @@ -5,6 +5,7 @@ from .. import results, lists, zips from .document_xml import read_document_xml_element +from .header_xml import (read_header_xml_element, read_footer_xml_element) from .content_types_xml import empty_content_types, read_content_types_xml_element from .relationships_xml import read_relationships_xml_element, Relationships from .numbering_xml import read_numbering_xml_element, Numbering @@ -27,12 +28,14 @@ def read(fileobj): zip_file, part_paths=part_paths, ) - + return results.combine([ _read_notes(read_part_with_body, part_paths), _read_comments(read_part_with_body, part_paths), + _read_headers(read_part_with_body, part_paths), + _read_footers(read_part_with_body, part_paths) ]).bind(lambda referents: - _read_document(zip_file, read_part_with_body, notes=referents[0], comments=referents[1], part_paths=part_paths) + _read_document(zip_file, read_part_with_body, notes=referents[0], comments=referents[1], headers=referents[2], footers=referents[3], part_paths=part_paths) ) @@ -43,6 +46,8 @@ class _PartPaths(object): endnotes = cobble.field() footnotes = cobble.field() numbering = cobble.field() + headers = cobble.field() + footers = cobble.field() styles = cobble.field() @@ 
-55,21 +60,24 @@ def _find_part_paths(zip_file): _find_relationships_path_for(document_filename), ) - def find(name): + def find(name, multiple=False): return _find_part_path( zip_file=zip_file, relationships=document_relationships, relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/" + name, fallback_path="word/{0}.xml".format(name), base_path=zips.split_path(document_filename)[0], + multiple=multiple ) - + return _PartPaths( main_document=document_filename, comments=find("comments"), endnotes=find("endnotes"), footnotes=find("footnotes"), numbering=find("numbering"), + headers=find("header", multiple=True), + footers=find("footer", multiple=True), styles=find("styles"), ) @@ -88,7 +96,7 @@ def _find_document_filename(zip_file, relationships): raise IOError("Could not find main document part. Are you sure this is a valid .docx file?") -def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path): +def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path, multiple=False): targets = [ zips.join_path(base_path, target).lstrip("/") for target in relationships.find_targets_by_type(relationship_type) @@ -97,7 +105,7 @@ def _find_part_path(zip_file, relationships, relationship_type, base_path, fallb if len(valid_targets) == 0: return fallback_path else: - return valid_targets[0] + return valid_targets if multiple else valid_targets[0] def _read_notes(read_part_with_body, part_paths): @@ -111,7 +119,7 @@ def _read_notes(read_part_with_body, part_paths): lambda root, body_reader: read_endnotes_xml_element(root, body_reader=body_reader), default=_empty_result, ) - + return results.combine([footnotes, endnotes]).map(lists.flatten) @@ -122,14 +130,42 @@ def _read_comments(read_part_with_body, part_paths): default=_empty_result, ) +def _read_headers(read_part_with_body, part_paths): + if isinstance(part_paths.headers, str): + header_paths = [part_paths.headers] + else: + header_paths =
part_paths.headers + + headers = [ + read_part_with_body(header, + lambda root, body_reader: read_header_xml_element(root, body_reader=body_reader), + default=_empty_result) for header in header_paths] + return [h for h in headers if h.value != []] + + +def _read_footers(read_part_with_body, part_paths): + if isinstance(part_paths.footers, str): + footer_paths = [part_paths.footers] + else: + footer_paths = part_paths.footers + + footers = [ + read_part_with_body(footer, + lambda root, body_reader: read_footer_xml_element(root, body_reader=body_reader), + default=_empty_result) for footer in footer_paths] + + return [f for f in footers if f.value != []] + -def _read_document(zip_file, read_part_with_body, notes, comments, part_paths): +def _read_document(zip_file, read_part_with_body, notes, comments, headers, footers, part_paths): return read_part_with_body( part_paths.main_document, partial( read_document_xml_element, notes=notes, comments=comments, + headers=headers, + footers=footers ), ) diff --git a/mammoth/docx/document_xml.py b/mammoth/docx/document_xml.py index e0d19a4e..4758a625 100644 --- a/mammoth/docx/document_xml.py +++ b/mammoth/docx/document_xml.py @@ -5,7 +5,9 @@ def read_document_xml_element( element, body_reader, notes=None, - comments=None): + comments=None, + headers=None, + footers=None): if notes is None: notes = [] @@ -17,5 +19,7 @@ def read_document_xml_element( .map(lambda children: documents.document( children, notes=documents.notes(notes), - comments=comments + comments=comments, + headers=headers, + footers=footers )) diff --git a/mammoth/docx/header_xml.py b/mammoth/docx/header_xml.py new file mode 100644 index 00000000..3f0dc519 --- /dev/null +++ b/mammoth/docx/header_xml.py @@ -0,0 +1,9 @@ +import functools +from ..
import documents + +def _read_extremity(extremity, element, body_reader): + return body_reader.read_all(element.children) \ + .map(lambda children: extremity(children)) + +read_header_xml_element = functools.partial(_read_extremity, documents.header) +read_footer_xml_element = functools.partial(_read_extremity, documents.footer) \ No newline at end of file diff --git a/mammoth/html/nodes.py b/mammoth/html/nodes.py index 9c072599..a59d1a0a 100644 --- a/mammoth/html/nodes.py +++ b/mammoth/html/nodes.py @@ -9,6 +9,9 @@ class Node(object): class TextNode(Node): value = cobble.field() + def to_text(self): + return self.value + @cobble.data class Tag(object): @@ -52,6 +55,8 @@ def separator(self): def is_void(self): return not self.children and self.tag_name in self._VOID_TAG_NAMES + def to_text(self): + return "".join([s.to_text() for s in iter(self.children)]) @cobble.visitable class ForceWrite(Node): diff --git a/mammoth/options.py b/mammoth/options.py index 11a61b6c..55eb4ec7 100644 --- a/mammoth/options.py +++ b/mammoth/options.py @@ -19,6 +19,10 @@ def read_options(options): style_map += _default_style_map options["ignore_empty_paragraphs"] = options.get("ignore_empty_paragraphs", True) + + options["include_headers_and_footers"] = options.get("include_headers_and_footers", False) + options["deduplicate_headers_and_footers"] = options.get("deduplicate_headers_and_footers", False) + options["style_map"] = style_map return read_style_map_result.map(lambda _: options) diff --git a/mammoth/results.py b/mammoth/results.py index 59c3a421..ba1111fe 100644 --- a/mammoth/results.py +++ b/mammoth/results.py @@ -31,9 +31,13 @@ def combine(results): values = [] messages = [] for result in results: - values.append(result.value) - for message in result.messages: - messages.append(message) + if isinstance(result, list): + values.append([r.value for r in result]) + for r in result: + messages.extend(r.messages) + else: + values.append(result.value) + 
messages.extend(result.messages) return Result(values, messages)