Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,12 @@ Converts the source document to HTML.
* `ignore_empty_paragraphs`: by default, empty paragraphs are ignored.
Set this option to `False` to preserve empty paragraphs in the output.

* `include_headers_and_footers`: by default, headers and footers are not included in the output.
Set this option to `True` to include them at the start and end of the output.

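* `deduplicate_headers_and_footers`: by default, when headers and footers are included, all of them are output, including duplicates.
Set this option to `True` to only include headers and footers with unique text. This option has no effect unless `include_headers_and_footers` is `True`.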

* `id_prefix`:
a string to prepend to any generated IDs,
such as those used by bookmarks, footnotes and endnotes.
Expand Down
110 changes: 97 additions & 13 deletions mammoth/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ def convert_document_element_to_html(element,
convert_image=None,
id_prefix=None,
output_format=None,
ignore_empty_paragraphs=True):
ignore_empty_paragraphs=True,
include_headers_and_footers=False,
deduplicate_headers_and_footers=False):

if style_map is None:
style_map = []
Expand All @@ -42,6 +44,8 @@ def convert_document_element_to_html(element,
convert_image=convert_image,
id_prefix=id_prefix,
ignore_empty_paragraphs=ignore_empty_paragraphs,
include_headers_and_footers=include_headers_and_footers,
deduplicate_headers_and_footers=deduplicate_headers_and_footers,
note_references=[],
comments=comments,
)
Expand All @@ -62,11 +66,22 @@ def copy(self, **kwargs):


class _DocumentConverter(documents.element_visitor(args=1)):
def __init__(self, messages, style_map, convert_image, id_prefix, ignore_empty_paragraphs, note_references, comments):
def __init__(self,
messages,
style_map,
convert_image,
id_prefix,
ignore_empty_paragraphs,
include_headers_and_footers,
deduplicate_headers_and_footers,
note_references,
comments):
self._messages = messages
self._style_map = style_map
self._id_prefix = id_prefix
self._ignore_empty_paragraphs = ignore_empty_paragraphs
self._include_headers_and_footers = include_headers_and_footers
self._deduplicate_headers_and_footers = deduplicate_headers_and_footers
self._note_references = note_references
self._referenced_comments = []
self._convert_image = convert_image
Expand All @@ -80,18 +95,13 @@ def visit_image(self, image, context):
return []

def visit_document(self, document, context):
headers = self.visit_headers(document.headers, context)
nodes = self._visit_all(document.children, context)
notes = [
document.notes.resolve(reference)
for reference in self._note_references
]
notes_list = html.element("ol", {}, self._visit_all(notes, context))
comments = html.element("dl", {}, [
html_node
for referenced_comment in self._referenced_comments
for html_node in self.visit_comment(referenced_comment, context)
])
return nodes + [notes_list, comments]
notes_list = self.visit_notes(document.notes, context)
comments = self.visit_comments(context)
footers = self.visit_footers(document.footers, context)

return headers + nodes + [notes_list, comments] + footers


def visit_paragraph(self, paragraph, context):
Expand Down Expand Up @@ -257,6 +267,12 @@ def visit_note(self, note, context):
html.element("li", {"id": self._note_html_id(note)}, note_body)
]

def visit_notes(self, notes, context):
    """Build the ``<ol>`` element listing every note referenced so far.

    Each reference collected in ``self._note_references`` is resolved against
    ``notes`` and the resolved notes are converted to HTML in that order.
    """
    resolved_notes = []
    for reference in self._note_references:
        resolved_notes.append(notes.resolve(reference))

    list_items = self._visit_all(resolved_notes, context)
    return html.element("ol", {}, list_items)

def visit_comment_reference(self, reference, context):
def nodes():
Expand Down Expand Up @@ -300,6 +316,74 @@ def visit_comment(self, referenced_comment, context):
html.element("dd", {}, body),
]

def visit_comments(self, context):
    """Build the ``<dl>`` element holding every referenced comment.

    The HTML nodes returned by ``visit_comment`` for each entry of
    ``self._referenced_comments`` are concatenated, in order, as the children
    of the list.
    """
    comment_nodes = []
    for referenced_comment in self._referenced_comments:
        for html_node in self.visit_comment(referenced_comment, context):
            comment_nodes.append(html_node)

    return html.element("dl", {}, comment_nodes)

def visit_header(self, header, context):
    """Convert one header to HTML by converting each of its children."""
    header_children = header.children
    return self._visit_all(header_children, context)

def visit_headers(self, headers, context):
    """Convert the document's headers into ``<header>`` HTML elements.

    Returns an empty list unless ``include_headers_and_footers`` was enabled.
    Otherwise each HTML node produced by converting a header's children is
    wrapped in its own ``<header>`` element. When
    ``deduplicate_headers_and_footers`` is enabled, a node is dropped if an
    earlier node had the same ``to_text()`` value, so only the first
    occurrence of each text is kept.
    """
    if not self._include_headers_and_footers:
        return []

    header_nodes = [
        html_node
        for header in headers
        for html_node in self.visit_header(header, context)
    ]

    if self._deduplicate_headers_and_footers:
        seen_texts = set()
        unique_nodes = []
        for node in header_nodes:
            text = node.to_text()
            if text not in seen_texts:
                seen_texts.add(text)
                unique_nodes.append(node)
        header_nodes = unique_nodes

    # Always wrap the converted HTML nodes. The previous non-deduplicating
    # branch wrapped the unconverted entries of ``headers`` instead, which
    # disagreed with how footers are handled.
    return [
        html.element("header", {}, [node])
        for node in header_nodes
    ]

def visit_footer(self, footer, context):
    """Convert one footer to HTML by converting each of its children."""
    footer_children = footer.children
    return self._visit_all(footer_children, context)

def visit_footers(self, footers, context):
    """Convert the document's footers into ``<footer>`` HTML elements.

    Returns an empty list unless ``include_headers_and_footers`` was enabled.
    Otherwise each HTML node produced by converting a footer's children is
    wrapped in its own ``<footer>`` element. When
    ``deduplicate_headers_and_footers`` is enabled, only the first node for
    each distinct ``to_text()`` value is kept.
    """
    if not self._include_headers_and_footers:
        return []

    footer_nodes = []
    for footer in footers:
        footer_nodes.extend(self.visit_footer(footer, context))

    if self._deduplicate_headers_and_footers:
        seen_texts = set()
        unique_nodes = []
        for node in footer_nodes:
            if node.to_text() in seen_texts:
                continue
            unique_nodes.append(node)
            seen_texts.add(node.to_text())
        footer_nodes = unique_nodes

    return [html.element("footer", {}, [node]) for node in footer_nodes]

def _visit_all(self, elements, context):
return [
Expand Down
22 changes: 20 additions & 2 deletions mammoth/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ class HasChildren(Element):
class Document(HasChildren):
notes = cobble.field()
comments = cobble.field()
headers = cobble.field()
footers = cobble.field()

@cobble.data
class Paragraph(HasChildren):
Expand Down Expand Up @@ -97,12 +99,16 @@ class Image(Element):
open = cobble.field()


def document(children, notes=None, comments=None):
def document(children, notes=None, comments=None, headers=None, footers=None):
if notes is None:
notes = Notes({})
if comments is None:
comments = []
return Document(children, notes, comments=comments)
if headers is None:
headers = []
if footers is None:
footers = []
return Document(children, notes, comments=comments, headers=headers, footers=footers)

def paragraph(children, style_id=None, style_name=None, numbering=None, alignment=None, indent=None):
if indent is None:
Expand Down Expand Up @@ -252,5 +258,17 @@ class CommentReference(Element):

comment_reference = CommentReference

@cobble.data
class Header(HasChildren):
    """Document element holding the children read from one header part."""


header = Header

@cobble.data
class Footer(HasChildren):
    """Document element holding the children read from one footer part."""


footer = Footer

def element_visitor(args):
    """Return a ``cobble.visitor`` for ``Element``, forwarding ``args`` unchanged.

    The returned value is used as a base class by the document converter.
    """
    return cobble.visitor(Element, args=args)
52 changes: 44 additions & 8 deletions mammoth/docx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from .. import results, lists, zips
from .document_xml import read_document_xml_element
from .header_xml import (read_header_xml_element, read_footer_xml_element)
from .content_types_xml import empty_content_types, read_content_types_xml_element
from .relationships_xml import read_relationships_xml_element, Relationships
from .numbering_xml import read_numbering_xml_element, Numbering
Expand All @@ -27,12 +28,14 @@ def read(fileobj):
zip_file,
part_paths=part_paths,
)

return results.combine([
_read_notes(read_part_with_body, part_paths),
_read_comments(read_part_with_body, part_paths),
_read_headers(read_part_with_body, part_paths),
_read_footers(read_part_with_body, part_paths)
]).bind(lambda referents:
_read_document(zip_file, read_part_with_body, notes=referents[0], comments=referents[1], part_paths=part_paths)
_read_document(zip_file, read_part_with_body, notes=referents[0], comments=referents[1], headers=referents[2], footers=referents[3], part_paths=part_paths)
)


Expand All @@ -43,6 +46,8 @@ class _PartPaths(object):
endnotes = cobble.field()
footnotes = cobble.field()
numbering = cobble.field()
headers = cobble.field()
footers = cobble.field()
styles = cobble.field()


Expand All @@ -55,21 +60,24 @@ def _find_part_paths(zip_file):
_find_relationships_path_for(document_filename),
)

def find(name):
def find(name, multiple=False):
return _find_part_path(
zip_file=zip_file,
relationships=document_relationships,
relationship_type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/" + name,
fallback_path="word/{0}.xml".format(name),
base_path=zips.split_path(document_filename)[0],
multiple=multiple
)

return _PartPaths(
main_document=document_filename,
comments=find("comments"),
endnotes=find("endnotes"),
footnotes=find("footnotes"),
numbering=find("numbering"),
headers=find("header", multiple=True),
footers=find("footer", multiple=True),
styles=find("styles"),
)

Expand All @@ -88,7 +96,7 @@ def _find_document_filename(zip_file, relationships):
raise IOError("Could not find main document part. Are you sure this is a valid .docx file?")


def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path):
def _find_part_path(zip_file, relationships, relationship_type, base_path, fallback_path, multiple=False):
targets = [
zips.join_path(base_path, target).lstrip("/")
for target in relationships.find_targets_by_type(relationship_type)
Expand All @@ -97,7 +105,7 @@ def _find_part_path(zip_file, relationships, relationship_type, base_path, fallb
if len(valid_targets) == 0:
return fallback_path
else:
return valid_targets[0]
return valid_targets if multiple else valid_targets[0]


def _read_notes(read_part_with_body, part_paths):
Expand All @@ -111,7 +119,7 @@ def _read_notes(read_part_with_body, part_paths):
lambda root, body_reader: read_endnotes_xml_element(root, body_reader=body_reader),
default=_empty_result,
)

return results.combine([footnotes, endnotes]).map(lists.flatten)


Expand All @@ -122,14 +130,42 @@ def _read_comments(read_part_with_body, part_paths):
default=_empty_result,
)

def _read_headers(read_part_with_body, part_paths):
    """Read every header part of the document into a single result.

    ``part_paths.headers`` is either a list of part paths or, when no header
    relationship targets were found, a single fallback path. Each part is
    read with ``read_header_xml_element``; parts that yield the empty default
    value are discarded.

    Returns a combined result whose value is the list of headers read, so
    that it can be passed to ``results.combine`` next to the notes and
    comments results.
    """
    header_paths = part_paths.headers
    # A lone fallback path is not a list; wrap it so that we iterate over
    # paths rather than over the characters of a string. Checking for a
    # sequence instead of ``type(...) == str`` also covers string subclasses
    # and other text types.
    if not isinstance(header_paths, (list, tuple)):
        header_paths = [header_paths]

    header_results = [
        read_part_with_body(
            header_path,
            lambda root, body_reader: read_header_xml_element(root, body_reader=body_reader),
            default=_empty_result,
        )
        for header_path in header_paths
    ]

    # Combine into one result object: the caller feeds this function's
    # return value to ``results.combine``, which a bare list does not suit.
    return results.combine([
        header_result
        for header_result in header_results
        if header_result.value != []
    ])


def _read_footers(read_part_with_body, part_paths):
    """Read every footer part of the document into a single result.

    ``part_paths.footers`` is either a list of part paths or, when no footer
    relationship targets were found, a single fallback path. Each part is
    read with ``read_footer_xml_element``; parts that yield the empty default
    value are discarded.

    Returns a combined result whose value is the list of footers read, so
    that it can be passed to ``results.combine`` next to the notes and
    comments results.
    """
    footer_paths = part_paths.footers
    # A lone fallback path is not a list; wrap it so that we iterate over
    # paths rather than over the characters of a string. Checking for a
    # sequence instead of ``type(...) == str`` also covers string subclasses
    # and other text types.
    if not isinstance(footer_paths, (list, tuple)):
        footer_paths = [footer_paths]

    footer_results = [
        read_part_with_body(
            footer_path,
            lambda root, body_reader: read_footer_xml_element(root, body_reader=body_reader),
            default=_empty_result,
        )
        for footer_path in footer_paths
    ]

    # Combine into one result object: the caller feeds this function's
    # return value to ``results.combine``, which a bare list does not suit.
    return results.combine([
        footer_result
        for footer_result in footer_results
        if footer_result.value != []
    ])


def _read_document(zip_file, read_part_with_body, notes, comments, part_paths):
def _read_document(zip_file, read_part_with_body, notes, comments, headers, footers, part_paths):
return read_part_with_body(
part_paths.main_document,
partial(
read_document_xml_element,
notes=notes,
comments=comments,
headers=headers,
footers=footers
),
)

Expand Down
8 changes: 6 additions & 2 deletions mammoth/docx/document_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ def read_document_xml_element(
element,
body_reader,
notes=None,
comments=None):
comments=None,
headers=None,
footers=None):

if notes is None:
notes = []
Expand All @@ -17,5 +19,7 @@ def read_document_xml_element(
.map(lambda children: documents.document(
children,
notes=documents.notes(notes),
comments=comments
comments=comments,
headers=headers,
footers=footers
))
9 changes: 9 additions & 0 deletions mammoth/docx/header_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import functools
from .. import documents

def _read_extremity(extremity, element, body_reader):
    """Read an element's children and wrap them with ``extremity``.

    ``body_reader.read_all`` is applied to ``element.children`` and the
    value of its result is mapped through ``extremity``, the callable that
    builds the header or footer document element.
    """
    def build(children):
        return extremity(children)

    children_result = body_reader.read_all(element.children)
    return children_result.map(build)

# Readers for header and footer XML elements: both read the element's children
# and differ only in the document element (header or footer) built from them.
read_header_xml_element = functools.partial(_read_extremity, documents.header)
read_footer_xml_element = functools.partial(_read_extremity, documents.footer)
5 changes: 5 additions & 0 deletions mammoth/html/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ class Node(object):
class TextNode(Node):
value = cobble.field()

def to_text(self):
    """Return the text of this node, i.e. its ``value``."""
    return self.value


@cobble.data
class Tag(object):
Expand Down Expand Up @@ -52,6 +55,8 @@ def separator(self):
def is_void(self):
return not self.children and self.tag_name in self._VOID_TAG_NAMES

def to_text(self):
    """Return the concatenated text of all children, in order."""
    pieces = []
    for child in self.children:
        pieces.append(child.to_text())
    return "".join(pieces)

@cobble.visitable
class ForceWrite(Node):
Expand Down
Loading