Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
/.tox
/MANIFEST
/build
*.iml
6 changes: 6 additions & 0 deletions mammoth/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ class TableCell(HasChildren):
class Break(Element):
break_type = cobble.field()

@cobble.data
class Size(object):
    # Display dimensions attached to an Image element.
    #
    # The values are stored exactly as the reader supplies them: inline
    # drawings provide whole pixel counts rendered as strings, while VML
    # shapes provide the raw CSS length taken from the shape's style
    # attribute (for example "31.5pt" or "0.58in").
    width = cobble.field()
    height = cobble.field()

line_break = Break("line")
page_break = Break("page")
column_break = Break("column")
Expand All @@ -97,6 +102,7 @@ class Image(Element):
alt_text = cobble.field()
content_type = cobble.field()
open = cobble.field()
size = cobble.field(default=None)


def document(children, notes=None, comments=None):
Expand Down
54 changes: 44 additions & 10 deletions mammoth/docx/body_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from .styles_xml import Styles
from .uris import replace_fragment, uri_to_zip_entry_name

EMU_PER_PIXEL = 9525

if sys.version_info >= (3, ):
unichr = chr

Expand Down Expand Up @@ -423,23 +425,32 @@ def inline(element):
alt_text = properties.get("descr")
else:
alt_text = properties.get("title")
dimensions = element.find_child_or_null("wp:extent").attributes
size = documents.Size(
width=str(_emu_to_pixel(dimensions.get("cx"))),
height=str(_emu_to_pixel(dimensions.get("cy")))
)

blips = element.find_children("a:graphic") \
.find_children("a:graphicData") \
.find_children("pic:pic") \
.find_children("pic:blipFill") \
.find_children("a:blip")
return _read_blips(blips, alt_text)
return _read_blips(blips, alt_text, size)

def _read_blips(blips, alt_text):
return _ReadResult.concat(lists.map(lambda blip: _read_blip(blip, alt_text), blips))
def _emu_to_pixel(emu):
    """Convert a length in EMUs to a whole number of pixels.

    ``emu`` may be a number or a numeric string; the quotient by
    EMU_PER_PIXEL is rounded with the built-in ``round`` and returned as an
    ``int``.
    """
    pixels = float(emu) / EMU_PER_PIXEL
    return int(round(pixels))

def _read_blip(element, alt_text):
return _read_image(lambda: _find_blip_image(element), alt_text)
def _read_blips(blips, alt_text, size):
return _ReadResult.concat(lists.map(lambda blip: _read_blip(blip, alt_text, size), blips))

def _read_image(find_image, alt_text):
def _read_blip(element, alt_text, size):
return _read_image(lambda: _find_blip_image(element), alt_text, size)

def _read_image(find_image, alt_text, size=None):
image_path, open_image = find_image()
content_type = content_types.find_content_type(image_path)
image = documents.image(alt_text=alt_text, content_type=content_type, open=open_image)
image = documents.image(alt_text=alt_text, content_type=content_type, size=size, open=open_image)

if content_type in ["image/png", "image/gif", "image/jpeg", "image/svg+xml", "image/tiff"]:
messages = []
Expand Down Expand Up @@ -478,14 +489,37 @@ def open_image():

return image_path, open_image

def read_imagedata(element):
def shape(element):
    """Read a v:shape element.

    A shape whose single child is a v:imagedata element is read as an image
    that carries the size declared in the shape's style attribute. Every
    other shape is read through its child elements, without a size.
    """
    if len(element.children) != 1:
        return read_child_elements(element)

    imagedata = element.find_child("v:imagedata")
    if not imagedata:
        return read_child_elements(element)

    return read_imagedata(imagedata, _read_shape_size(element))

def _read_shape_size(element):
    """Read the size declared in a shape element's ``style`` attribute.

    Returns a ``documents.Size`` holding the raw ``width`` and ``height``
    values (units preserved, e.g. "31.5pt"), or None when the element has no
    style attribute or the style does not declare both dimensions.
    """
    style_attribute = element.attributes.get("style")
    if not style_attribute:
        return None

    declarations = style_attribute.split(";")
    width = _extract_size_from_style("width", declarations)
    height = _extract_size_from_style("height", declarations)
    # A size with only one dimension would put an empty attribute on the
    # generated image, so treat a partial declaration as "no size".
    if width is None or height is None:
        return None
    return documents.Size(width=width, height=height)

def _extract_size_from_style(style_name, style):
    """Return the value declared for ``style_name``, or None if absent.

    ``style`` is a list of "name:value" declarations as produced by splitting
    a style attribute on ";". Whitespace around the name and the value is
    ignored, and declarations without a value are skipped.
    """
    for declaration in style:
        name, separator, value = declaration.partition(":")
        if not separator or name.strip() != style_name:
            continue
        value = value.strip()
        if value:
            return value
    return None

def read_imagedata(element, style=None):
relationship_id = element.attributes.get("r:id")
if relationship_id is None:
warning = results.warning("A v:imagedata element without a relationship ID was ignored")
return _empty_result_with_message(warning)
else:
title = element.attributes.get("o:title")
return _read_image(lambda: _find_embedded_image(relationship_id), title)
return _read_image(lambda: _find_embedded_image(relationship_id), title, style)

def note_reference_reader(note_type):
def note_reference(element):
Expand Down Expand Up @@ -522,7 +556,7 @@ def read_sdt(element):
"v:group": read_child_elements,
"v:rect": read_child_elements,
"v:roundrect": read_child_elements,
"v:shape": read_child_elements,
"v:shape": shape,
"v:textbox": read_child_elements,
"w:txbxContent": read_child_elements,
"w:pict": pict,
Expand Down
3 changes: 3 additions & 0 deletions mammoth/images.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ def convert_image(image):
attributes = func(image).copy()
if image.alt_text:
attributes["alt"] = image.alt_text
if image.size:
attributes["width"] = image.size.width
attributes["height"] = image.size.height

return [html.element("img", attributes)]

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def read(fname):
name='mammoth',
version='1.4.15',
description='Convert Word documents from docx to simple and clean HTML and Markdown',
long_description=read("README"),
long_description=read("README.md"),
author='Michael Williamson',
author_email='[email protected]',
url='http://github.com/mwilliamson/python-mammoth',
Expand Down
4 changes: 2 additions & 2 deletions tests/cli_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def html_is_written_to_file_if_output_file_is_set():
def inline_images_are_included_in_output_if_writing_to_single_file():
docx_path = test_path("tiny-picture.docx")
result = _local.run(["mammoth", docx_path])
assert_equal(b"""<p><img src="data:image/png;base64,""" + _image_base_64 + b"""" /></p>""", result.output)
assert_equal(b"""<p><img height="10" src="data:image/png;base64,""" + _image_base_64 + b"""" width="10" /></p>""", result.output)


@istest
Expand All @@ -50,7 +50,7 @@ def images_are_written_to_separate_files_if_output_dir_is_set():
assert_equal(b"", result.stderr_output)
assert_equal(b"", result.output)
with open(output_path) as output_file:
assert_equal("""<p><img src="1.png" /></p>""", output_file.read())
assert_equal("""<p><img height="10" src="1.png" width="10" /></p>""", output_file.read())

with open(image_path, "rb") as image_file:
assert_equal(_image_base_64, base64.b64encode(image_file.read()))
Expand Down
13 changes: 13 additions & 0 deletions tests/conversion_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,19 @@ def images_have_alt_tags_if_available():
image_html = parse_xml(io.StringIO(result.value))
assert_equal('It\'s a hat', image_html.attributes["alt"])

@istest
def images_have_width_and_height_tags_if_available():
    # An image that carries a Size should expose it as width/height
    # attributes on the generated element.
    declared_size = documents.Size(width="42", height="51")
    image = documents.image(
        alt_text=None,
        content_type="image/png",
        size=declared_size,
        open=lambda: io.BytesIO(b"abc")
    )

    html_output = convert_document_element_to_html(image).value
    image_html = parse_xml(io.StringIO(html_output))

    assert_equal('42', image_html.attributes["width"])
    assert_equal('51', image_html.attributes["height"])


@istest
def can_define_custom_conversion_for_images():
Expand Down
89 changes: 74 additions & 15 deletions tests/docx/body_xml_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys

from precisely import assert_that, is_sequence
from nose.tools import istest, assert_equal
from nose.tools import istest, assert_equal, assert_is_none
from nose_parameterized import parameterized, param
import funk

Expand Down Expand Up @@ -961,39 +961,92 @@ class ImageTests(object):
IMAGE_RELATIONSHIP_ID = "rId5"

def _read_embedded_image(self, element):
return self._read_embedded_images(element)[0]

def _read_embedded_images(self, element):
relationships = Relationships([
_image_relationship(self.IMAGE_RELATIONSHIP_ID, "media/hat.png"),
])

mocks = funk.Mocks()
docx_file = mocks.mock()
funk.allows(docx_file).open("word/media/hat.png").returns(io.BytesIO(self.IMAGE_BYTES))

content_types = mocks.mock()
funk.allows(content_types).find_content_type("word/media/hat.png").returns("image/png")

return _read_and_get_document_xml_element(
return _read_and_get_document_xml_elements(
element,
content_types=content_types,
relationships=relationships,
docx_file=docx_file,
)

@istest
def can_read_imagedata_elements_with_rid_attribute(self):
imagedata_element = xml_element("v:imagedata", {
"r:id": self.IMAGE_RELATIONSHIP_ID,
"o:title": "It's a hat"
})
def can_read_shape_elements_with_rid_and_size_attributes(self):
shape_element = xml_element("v:shape", {"style": "width:31.5pt;height:38.25pt"}, [
xml_element("v:imagedata", {
"r:id": self.IMAGE_RELATIONSHIP_ID,
"o:title": "It's a hat"
})
])

image = self._read_embedded_image(imagedata_element)
image = self._read_embedded_image(shape_element)

assert_equal(documents.Image, type(image))
assert_equal("It's a hat", image.alt_text)
assert_equal("image/png", image.content_type)
assert_equal(documents.Size(width="31.5pt", height="38.25pt"), image.size)
with image.open() as image_file:
assert_equal(self.IMAGE_BYTES, image_file.read())

@istest
def cannot_resize_shape_with_multiple_nodes(self):
    # A shape with more than one child is read through its children, so the
    # image found inside it must not pick up the shape's size.
    imagedata = xml_element("v:imagedata", {
        "r:id": self.IMAGE_RELATIONSHIP_ID,
        "o:title": "It's a hat"
    })
    textbox = xml_element("v:textbox", {}, [
        xml_element("w:txbxContent", {}, [
            _paragraph_with_style_id("textbox-content")
        ])
    ])
    shape_element = xml_element(
        "v:shape",
        {"style": "width:31.5pt;height:38.25pt"},
        [imagedata, textbox],
    )

    nodes = self._read_embedded_images(shape_element)

    assert_equal(2, len(nodes))
    image_node = nodes[0]
    assert_equal(documents.Image, type(image_node))
    assert_equal("It's a hat", image_node.alt_text)
    assert_is_none(image_node.size)

@istest
def can_read_shape_elements_with_unused_style_elements(self):
    # Declarations other than width/height (here "position") must not
    # disturb the size read from the shape.
    imagedata = xml_element("v:imagedata", {
        "r:id": self.IMAGE_RELATIONSHIP_ID,
        "o:title": "It's a hat"
    })
    shape_attributes = {"style": "width:31.5pt;position:absolute;height:38.25pt"}
    shape_element = xml_element("v:shape", shape_attributes, [imagedata])

    image = self._read_embedded_image(shape_element)

    expected_size = documents.Size(width="31.5pt", height="38.25pt")
    assert_equal(documents.Image, type(image))
    assert_equal(expected_size, image.size)

@istest
def can_read_shape_elements_with_inch_size_attributes(self):
    # Sizes are passed through with their original unit, so inch values
    # come back unchanged.
    imagedata = xml_element("v:imagedata", {
        "r:id": self.IMAGE_RELATIONSHIP_ID,
        "o:title": "It's a hat"
    })
    shape_attributes = {"style": "width:0.58in;height:0.708in"}
    shape_element = xml_element("v:shape", shape_attributes, [imagedata])

    image = self._read_embedded_image(shape_element)

    expected_size = documents.Size(width="0.58in", height="0.708in")
    assert_equal(documents.Image, type(image))
    assert_equal(expected_size, image.size)

@istest
def when_imagedata_element_has_no_relationship_id_then_it_is_ignored_with_warning(self):
imagedata_element = xml_element("v:imagedata")
Expand All @@ -1009,13 +1062,15 @@ def can_read_inline_pictures(self):
drawing_element = _create_inline_image(
blip=_embedded_blip(self.IMAGE_RELATIONSHIP_ID),
description="It's a hat",
extent=(9525, 19000)
)

image = self._read_embedded_image(drawing_element)

assert_equal(documents.Image, type(image))
assert_equal("It's a hat", image.alt_text)
assert_equal("image/png", image.content_type)
assert_equal(documents.Size(width="1", height="2"), image.size)
with image.open() as image_file:
assert_equal(self.IMAGE_BYTES, image_file.read())

Expand Down Expand Up @@ -1307,9 +1362,9 @@ def _text_element(value):
return xml_element("w:t", {}, [xml_text(value)])


def _create_inline_image(blip, description=None, title=None):
def _create_inline_image(blip, description=None, title=None, extent=None):
return xml_element("w:drawing", {}, [
xml_element("wp:inline", {}, _create_image_elements(blip, description=description, title=title))
xml_element("wp:inline", {}, _create_image_elements(blip, description=description, title=title, extent=extent))
])


Expand All @@ -1319,15 +1374,19 @@ def _create_anchored_image(description, blip):
])


def _create_image_elements(blip, description=None, title=None):
def _create_image_elements(blip, description=None, title=None, extent=None):
properties = {}
if description is not None:
properties["descr"] = description
if title is not None:
properties["title"] = title

extent = {
"cx": extent[0] if extent else "0",
"cy": extent[1] if extent else "0"
}
return [
xml_element("wp:docPr", properties),
xml_element("wp:extent", extent),
xml_element("a:graphic", {}, [
xml_element("a:graphicData", {}, [
xml_element("pic:pic", {}, [
Expand Down
7 changes: 6 additions & 1 deletion tests/images_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,16 @@ def data_uri_encodes_images_in_base64():
image = mammoth.documents.Image(
alt_text=None,
content_type="image/jpeg",
size=mammoth.documents.Size(width="800", height="600"),
open=lambda: io.BytesIO(image_bytes),
)

result = mammoth.images.data_uri(image)

assert_that(result, contains(
has_properties(attributes={"src": ""}),
has_properties(attributes={
"src": "",
"width": "800",
"height": "600",
}),
))
8 changes: 4 additions & 4 deletions tests/mammoth_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,23 +112,23 @@ def warning_if_style_mapping_is_not_understood():
def inline_images_referenced_by_path_relative_to_part_are_included_in_output():
with open(test_path("tiny-picture.docx"), "rb") as fileobj:
result = mammoth.convert_to_html(fileobj=fileobj)
assert_equal("""<p><img src="" /></p>""", result.value)
assert_equal("""<p><img height="10" src="" width="10" /></p>""", result.value)
assert_equal([], result.messages)


@istest
def inline_images_referenced_by_path_relative_to_base_are_included_in_output():
with open(test_path("tiny-picture-target-base-relative.docx"), "rb") as fileobj:
result = mammoth.convert_to_html(fileobj=fileobj)
assert_equal("""<p><img src="" /></p>""", result.value)
assert_equal("""<p><img height="10" src="" width="10" /></p>""", result.value)
assert_equal([], result.messages)


@istest
def images_stored_outside_of_document_are_included_in_output():
with open(test_path("external-picture.docx"), "rb") as fileobj:
result = mammoth.convert_to_html(fileobj=fileobj)
assert_equal("""<p><img src="" /></p>""", result.value)
assert_equal("""<p><img height="10" src="" width="10" /></p>""", result.value)
assert_equal([], result.messages)


Expand Down Expand Up @@ -173,7 +173,7 @@ def convert_image(image):

with open(test_path("tiny-picture.docx"), "rb") as fileobj:
result = mammoth.convert_to_html(fileobj=fileobj, convert_image=convert_image)
assert_equal("""<p><img src="iV,image/png" /></p>""", result.value)
assert_equal("""<p><img height="10" src="iV,image/png" width="10" /></p>""", result.value)
assert_equal([], result.messages)


Expand Down