diff --git a/.gitignore b/.gitignore
index 3c7c12ac..baebf8dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@
/.tox
/MANIFEST
/build
+*.iml
diff --git a/mammoth/documents.py b/mammoth/documents.py
index 7559d406..17936bfd 100644
--- a/mammoth/documents.py
+++ b/mammoth/documents.py
@@ -82,6 +82,11 @@ class TableCell(HasChildren):
class Break(Element):
break_type = cobble.field()
+@cobble.data
+class Size(object):  # Display size of an image; stored on Image.size, which defaults to None when no size was read.
+ width = cobble.field()  # String: whole pixels for DrawingML images, the raw style value (e.g. "31.5pt") for VML shapes.
+ height = cobble.field()  # Same convention as width. NOTE(review): values are copied verbatim into the <img> width/height attributes.
+
line_break = Break("line")
page_break = Break("page")
column_break = Break("column")
@@ -97,6 +102,7 @@ class Image(Element):
alt_text = cobble.field()
content_type = cobble.field()
open = cobble.field()
+ size = cobble.field(default=None)
def document(children, notes=None, comments=None):
diff --git a/mammoth/docx/body_xml.py b/mammoth/docx/body_xml.py
index 8329bcfc..7b4e05e7 100644
--- a/mammoth/docx/body_xml.py
+++ b/mammoth/docx/body_xml.py
@@ -11,6 +11,8 @@
from .styles_xml import Styles
from .uris import replace_fragment, uri_to_zip_entry_name
+EMU_PER_PIXEL = 9525
+
if sys.version_info >= (3, ):
unichr = chr
@@ -423,23 +425,32 @@ def inline(element):
alt_text = properties.get("descr")
else:
alt_text = properties.get("title")
+ dimensions = element.find_child_or_null("wp:extent").attributes
+ size = documents.Size(
+ width=str(_emu_to_pixel(dimensions.get("cx"))),
+ height=str(_emu_to_pixel(dimensions.get("cy")))
+ )
+
blips = element.find_children("a:graphic") \
.find_children("a:graphicData") \
.find_children("pic:pic") \
.find_children("pic:blipFill") \
.find_children("a:blip")
- return _read_blips(blips, alt_text)
+ return _read_blips(blips, alt_text, size)
- def _read_blips(blips, alt_text):
- return _ReadResult.concat(lists.map(lambda blip: _read_blip(blip, alt_text), blips))
+ def _emu_to_pixel(emu):  # Convert a length in EMUs (number or numeric string) to whole pixels, EMU_PER_PIXEL EMUs per pixel.
+ return int(round(float(emu) / EMU_PER_PIXEL))  # NOTE(review): float(None) raises TypeError, so a wp:extent without cx/cy fails here - confirm callers always supply both.
- def _read_blip(element, alt_text):
- return _read_image(lambda: _find_blip_image(element), alt_text)
+ def _read_blips(blips, alt_text, size):  # Read each blip as an image sharing one alt text and size, then concatenate the results.
+ return _ReadResult.concat(lists.map(lambda blip: _read_blip(blip, alt_text, size), blips))
- def _read_image(find_image, alt_text):
+ def _read_blip(element, alt_text, size):  # Read a single a:blip; the image lookup is wrapped in a lambda so _read_image decides when to run it.
+ return _read_image(lambda: _find_blip_image(element), alt_text, size)
+
+ def _read_image(find_image, alt_text, size=None):
image_path, open_image = find_image()
content_type = content_types.find_content_type(image_path)
- image = documents.image(alt_text=alt_text, content_type=content_type, open=open_image)
+ image = documents.image(alt_text=alt_text, content_type=content_type, size=size, open=open_image)
if content_type in ["image/png", "image/gif", "image/jpeg", "image/svg+xml", "image/tiff"]:
messages = []
@@ -478,14 +489,37 @@ def open_image():
return image_path, open_image
- def read_imagedata(element):
+ def shape(element):
+     """Read a v:shape: a lone v:imagedata child becomes a sized image, anything else is read as plain children."""
+     has_single_child = len(element.children) == 1
+     imagedata = element.find_child("v:imagedata") if has_single_child else None
+     if not imagedata:
+         return read_child_elements(element)
+     return read_imagedata(imagedata, _read_shape_size(element))
+
+ def _read_shape_size(element):
+     """Return the Size from a v:shape style attribute (whitespace-tolerant), or None if width or height is missing."""
+     style_attribute = element.attributes.get("style")
+     if not style_attribute:
+         return None
+     style = [declaration.strip() for declaration in style_attribute.split(";")]
+     width = _extract_size_from_style("width", style)
+     height = _extract_size_from_style("height", style)
+     return documents.Size(width=width, height=height) if width and height else None
+
+ def _extract_size_from_style(style_name, style):
+     """Return the value of the first trimmed `style_name:` declaration in `style`, or None when there is none."""
+     with_column = "{}:".format(style_name)
+     return next((s[len(with_column):].strip() for s in style if s.startswith(with_column)), None)
+
+ def read_imagedata(element, style=None):  # NOTE: despite its name, `style` carries the documents.Size computed by shape() (or None); it is forwarded to _read_image as the image size.
relationship_id = element.attributes.get("r:id")
if relationship_id is None:
warning = results.warning("A v:imagedata element without a relationship ID was ignored")
return _empty_result_with_message(warning)
else:
title = element.attributes.get("o:title")
- return _read_image(lambda: _find_embedded_image(relationship_id), title)
+ return _read_image(lambda: _find_embedded_image(relationship_id), title, style)
def note_reference_reader(note_type):
def note_reference(element):
@@ -522,7 +556,7 @@ def read_sdt(element):
"v:group": read_child_elements,
"v:rect": read_child_elements,
"v:roundrect": read_child_elements,
- "v:shape": read_child_elements,
+ "v:shape": shape,
"v:textbox": read_child_elements,
"w:txbxContent": read_child_elements,
"w:pict": pict,
diff --git a/mammoth/images.py b/mammoth/images.py
index 9dba353f..89770803 100644
--- a/mammoth/images.py
+++ b/mammoth/images.py
@@ -8,6 +8,9 @@ def convert_image(image):
attributes = func(image).copy()
if image.alt_text:
attributes["alt"] = image.alt_text
+ if image.size:
+ attributes["width"] = image.size.width
+ attributes["height"] = image.size.height
return [html.element("img", attributes)]
diff --git a/setup.py b/setup.py
index 7da0d14d..751686f2 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@ def read(fname):
name='mammoth',
version='1.4.15',
description='Convert Word documents from docx to simple and clean HTML and Markdown',
- long_description=read("README"),
+ long_description=read("README.md"),
author='Michael Williamson',
author_email='mike@zwobble.org',
url='http://github.com/mwilliamson/python-mammoth',
diff --git a/tests/cli_tests.py b/tests/cli_tests.py
index 4728c6b0..82c2ef01 100644
--- a/tests/cli_tests.py
+++ b/tests/cli_tests.py
@@ -36,7 +36,7 @@ def html_is_written_to_file_if_output_file_is_set():
def inline_images_are_included_in_output_if_writing_to_single_file():
docx_path = test_path("tiny-picture.docx")
result = _local.run(["mammoth", docx_path])
- assert_equal(b"""

""", result.output)
+ assert_equal(b"""
""", result.output)
@istest
@@ -50,7 +50,7 @@ def images_are_written_to_separate_files_if_output_dir_is_set():
assert_equal(b"", result.stderr_output)
assert_equal(b"", result.output)
with open(output_path) as output_file:
- assert_equal("""
""", output_file.read())
+ assert_equal("""
""", output_file.read())
with open(image_path, "rb") as image_file:
assert_equal(_image_base_64, base64.b64encode(image_file.read()))
diff --git a/tests/conversion_tests.py b/tests/conversion_tests.py
index c4e667f3..49752380 100644
--- a/tests/conversion_tests.py
+++ b/tests/conversion_tests.py
@@ -529,6 +529,19 @@ def images_have_alt_tags_if_available():
image_html = parse_xml(io.StringIO(result.value))
assert_equal('It\'s a hat', image_html.attributes["alt"])
+@istest
+def images_have_width_and_height_tags_if_available():
+    """An image carrying a Size is rendered with matching width and height attributes."""
+    size = documents.Size(width="42", height="51")
+    image = documents.image(
+        alt_text=None, content_type="image/png", size=size,
+        open=lambda: io.BytesIO(b"abc"),
+    )
+    html_text = convert_document_element_to_html(image).value
+    attributes = parse_xml(io.StringIO(html_text)).attributes
+    assert_equal('42', attributes["width"])
+    assert_equal('51', attributes["height"])
+
@istest
def can_define_custom_conversion_for_images():
diff --git a/tests/docx/body_xml_tests.py b/tests/docx/body_xml_tests.py
index 94061fda..37e0d425 100644
--- a/tests/docx/body_xml_tests.py
+++ b/tests/docx/body_xml_tests.py
@@ -4,7 +4,7 @@
import sys
from precisely import assert_that, is_sequence
-from nose.tools import istest, assert_equal
+from nose.tools import istest, assert_equal, assert_is_none
from nose_parameterized import parameterized, param
import funk
@@ -961,18 +961,18 @@ class ImageTests(object):
IMAGE_RELATIONSHIP_ID = "rId5"
def _read_embedded_image(self, element):
+ return self._read_embedded_images(element)[0]
+
+ def _read_embedded_images(self, element):
relationships = Relationships([
_image_relationship(self.IMAGE_RELATIONSHIP_ID, "media/hat.png"),
])
-
mocks = funk.Mocks()
docx_file = mocks.mock()
funk.allows(docx_file).open("word/media/hat.png").returns(io.BytesIO(self.IMAGE_BYTES))
-
content_types = mocks.mock()
funk.allows(content_types).find_content_type("word/media/hat.png").returns("image/png")
-
- return _read_and_get_document_xml_element(
+ return _read_and_get_document_xml_elements(
element,
content_types=content_types,
relationships=relationships,
@@ -980,20 +980,73 @@ def _read_embedded_image(self, element):
)
@istest
- def can_read_imagedata_elements_with_rid_attribute(self):
- imagedata_element = xml_element("v:imagedata", {
- "r:id": self.IMAGE_RELATIONSHIP_ID,
- "o:title": "It's a hat"
- })
+ def can_read_shape_elements_with_rid_and_size_attributes(self):  # A v:shape wrapping one v:imagedata yields an Image whose size is the shape's style width/height, kept verbatim.
+ shape_element = xml_element("v:shape", {"style": "width:31.5pt;height:38.25pt"}, [
+ xml_element("v:imagedata", {
+ "r:id": self.IMAGE_RELATIONSHIP_ID,
+ "o:title": "It's a hat"
+ })
+ ])
- image = self._read_embedded_image(imagedata_element)
+ image = self._read_embedded_image(shape_element)
assert_equal(documents.Image, type(image))
assert_equal("It's a hat", image.alt_text)
assert_equal("image/png", image.content_type)
+ assert_equal(documents.Size(width="31.5pt", height="38.25pt"), image.size)
with image.open() as image_file:
assert_equal(self.IMAGE_BYTES, image_file.read())
+ @istest
+ def cannot_resize_shape_with_multiple_nodes(self):
+     """A v:shape with more than one child is read child by child, so its image gets no size."""
+     imagedata = xml_element("v:imagedata", {
+         "r:id": self.IMAGE_RELATIONSHIP_ID,
+         "o:title": "It's a hat",
+     })
+     textbox = xml_element("v:textbox", {}, [
+         xml_element("w:txbxContent", {}, [
+             _paragraph_with_style_id("textbox-content"),
+         ]),
+     ])
+     shape_element = xml_element("v:shape", {"style": "width:31.5pt;height:38.25pt"}, [imagedata, textbox])
+
+     nodes = self._read_embedded_images(shape_element)
+
+     assert_equal(2, len(nodes))
+     first_node = nodes[0]
+     assert_equal(documents.Image, type(first_node))
+     assert_equal("It's a hat", first_node.alt_text)
+     assert_is_none(first_node.size)
+
+ @istest
+ def can_read_shape_elements_with_unused_style_elements(self):
+     """Style declarations other than width and height do not affect the size that is read."""
+     imagedata = xml_element("v:imagedata", {
+         "r:id": self.IMAGE_RELATIONSHIP_ID,
+         "o:title": "It's a hat",
+     })
+     style = "width:31.5pt;position:absolute;height:38.25pt"
+     shape_element = xml_element("v:shape", {"style": style}, [imagedata])
+
+     image = self._read_embedded_image(shape_element)
+     assert_equal(documents.Image, type(image))
+     assert_equal(documents.Size(width="31.5pt", height="38.25pt"), image.size)
+
+ @istest
+ def can_read_shape_elements_with_inch_size_attributes(self):
+     """Sizes given in inches are passed through unchanged, unit suffix included."""
+     imagedata = xml_element("v:imagedata", {
+         "r:id": self.IMAGE_RELATIONSHIP_ID,
+         "o:title": "It's a hat",
+     })
+     style = "width:0.58in;height:0.708in"
+     shape_element = xml_element("v:shape", {"style": style}, [imagedata])
+
+     image = self._read_embedded_image(shape_element)
+     assert_equal(documents.Image, type(image))
+     assert_equal(documents.Size(width="0.58in", height="0.708in"), image.size)
+
@istest
def when_imagedata_element_has_no_relationship_id_then_it_is_ignored_with_warning(self):
imagedata_element = xml_element("v:imagedata")
@@ -1009,6 +1062,7 @@ def can_read_inline_pictures(self):
drawing_element = _create_inline_image(
blip=_embedded_blip(self.IMAGE_RELATIONSHIP_ID),
description="It's a hat",
+ extent=(9525, 19000)
)
image = self._read_embedded_image(drawing_element)
@@ -1016,6 +1070,7 @@ def can_read_inline_pictures(self):
assert_equal(documents.Image, type(image))
assert_equal("It's a hat", image.alt_text)
assert_equal("image/png", image.content_type)
+ assert_equal(documents.Size(width="1", height="2"), image.size)
with image.open() as image_file:
assert_equal(self.IMAGE_BYTES, image_file.read())
@@ -1307,9 +1362,9 @@ def _text_element(value):
return xml_element("w:t", {}, [xml_text(value)])
-def _create_inline_image(blip, description=None, title=None):
+def _create_inline_image(blip, description=None, title=None, extent=None):  # extent: optional (cx, cy) pair forwarded to _create_image_elements for the wp:extent element.
return xml_element("w:drawing", {}, [
- xml_element("wp:inline", {}, _create_image_elements(blip, description=description, title=title))
+ xml_element("wp:inline", {}, _create_image_elements(blip, description=description, title=title, extent=extent))
])
@@ -1319,15 +1374,19 @@ def _create_anchored_image(description, blip):
])
-def _create_image_elements(blip, description=None, title=None):
+def _create_image_elements(blip, description=None, title=None, extent=None):
properties = {}
if description is not None:
properties["descr"] = description
if title is not None:
properties["title"] = title
-
+ extent = {
+ "cx": extent[0] if extent else "0",
+ "cy": extent[1] if extent else "0"
+ }
return [
xml_element("wp:docPr", properties),
+ xml_element("wp:extent", extent),
xml_element("a:graphic", {}, [
xml_element("a:graphicData", {}, [
xml_element("pic:pic", {}, [
diff --git a/tests/images_tests.py b/tests/images_tests.py
index cf2e2844..9962d252 100644
--- a/tests/images_tests.py
+++ b/tests/images_tests.py
@@ -17,11 +17,16 @@ def data_uri_encodes_images_in_base64():
image = mammoth.documents.Image(
alt_text=None,
content_type="image/jpeg",
+ size=mammoth.documents.Size(width="800", height="600"),
open=lambda: io.BytesIO(image_bytes),
)
result = mammoth.images.data_uri(image)
assert_that(result, contains(
- has_properties(attributes={"src": "data:image/jpeg;base64,YWJj"}),
+ has_properties(attributes={
+ "src": "data:image/jpeg;base64,YWJj",
+ "width": "800",
+ "height": "600",
+ }),
))
diff --git a/tests/mammoth_tests.py b/tests/mammoth_tests.py
index 1dcbaa9c..175c6cc8 100644
--- a/tests/mammoth_tests.py
+++ b/tests/mammoth_tests.py
@@ -112,7 +112,7 @@ def warning_if_style_mapping_is_not_understood():
def inline_images_referenced_by_path_relative_to_part_are_included_in_output():
with open(test_path("tiny-picture.docx"), "rb") as fileobj:
result = mammoth.convert_to_html(fileobj=fileobj)
- assert_equal("""
""", result.value)
+ assert_equal("""
""", result.value)
assert_equal([], result.messages)
@@ -120,7 +120,7 @@ def inline_images_referenced_by_path_relative_to_part_are_included_in_output():
def inline_images_referenced_by_path_relative_to_base_are_included_in_output():
with open(test_path("tiny-picture-target-base-relative.docx"), "rb") as fileobj:
result = mammoth.convert_to_html(fileobj=fileobj)
- assert_equal("""
""", result.value)
+ assert_equal("""
""", result.value)
assert_equal([], result.messages)
@@ -128,7 +128,7 @@ def inline_images_referenced_by_path_relative_to_base_are_included_in_output():
def images_stored_outside_of_document_are_included_in_output():
with open(test_path("external-picture.docx"), "rb") as fileobj:
result = mammoth.convert_to_html(fileobj=fileobj)
- assert_equal("""
""", result.value)
+ assert_equal("""
""", result.value)
assert_equal([], result.messages)
@@ -173,7 +173,7 @@ def convert_image(image):
with open(test_path("tiny-picture.docx"), "rb") as fileobj:
result = mammoth.convert_to_html(fileobj=fileobj, convert_image=convert_image)
- assert_equal("""
""", result.value)
+ assert_equal("""
""", result.value)
assert_equal([], result.messages)