mwilliamson · antoinearbouin · Jul 20, 2020 · Jul 20, 2020 · Jul 20, 2020 · Jul 21, 2020
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@
 /.tox
 /MANIFEST
 /build
+*.iml
diff --git a/mammoth/documents.py b/mammoth/documents.py
@@ -82,6 +82,11 @@ class TableCell(HasChildren):
 class Break(Element):
     break_type = cobble.field()
 
+# Dimensions of an image as read from the source document.
+# Both fields hold strings whose format depends on where the image came from:
+# images read via wp:extent carry whole pixel counts (e.g. "10"), while
+# images read from a v:shape carry the length copied verbatim from the
+# shape's style attribute, unit included (e.g. "31.5pt" or "0.58in").
+@cobble.data
+class Size(object):
+    width = cobble.field()
+    height = cobble.field()
+
 line_break = Break("line")
 page_break = Break("page")
 column_break = Break("column")
@@ -97,6 +102,7 @@ class Image(Element):
     alt_text = cobble.field()
     content_type = cobble.field()
     open = cobble.field()
+    size = cobble.field(default=None)
 
 
 def document(children, notes=None, comments=None):

diff --git a/mammoth/docx/body_xml.py b/mammoth/docx/body_xml.py
@@ -11,6 +11,8 @@
 from .styles_xml import Styles
 from .uris import replace_fragment, uri_to_zip_entry_name
 
+EMU_PER_PIXEL = 9525
+
 if sys.version_info >= (3, ):
     unichr = chr
 
@@ -423,23 +425,32 @@ def inline(element):
             alt_text = properties.get("descr")
         else:
             alt_text = properties.get("title")
+        dimensions = element.find_child_or_null("wp:extent").attributes
+        size = documents.Size(
+            width=str(_emu_to_pixel(dimensions.get("cx"))),
+            height=str(_emu_to_pixel(dimensions.get("cy")))
+        )
+
         blips = element.find_children("a:graphic") \
             .find_children("a:graphicData") \
             .find_children("pic:pic") \
             .find_children("pic:blipFill") \
             .find_children("a:blip")
-        return _read_blips(blips, alt_text)
+        return _read_blips(blips, alt_text, size)
 
-    def _read_blips(blips, alt_text):
-        return _ReadResult.concat(lists.map(lambda blip: _read_blip(blip, alt_text), blips))
+    def _emu_to_pixel(emu):
+        """Convert a length in EMUs to the nearest whole number of pixels.
+
+        ``emu`` may be anything ``float`` accepts, such as the string value
+        of an XML attribute. The result is always an ``int``.
+        """
+        pixels = float(emu) / EMU_PER_PIXEL
+        return int(round(pixels))
 
-    def _read_blip(element, alt_text):
-        return _read_image(lambda: _find_blip_image(element), alt_text)
+    def _read_blips(blips, alt_text, size):
+        return _ReadResult.concat(lists.map(lambda blip: _read_blip(blip, alt_text, size), blips))
 
-    def _read_image(find_image, alt_text):
+    def _read_blip(element, alt_text, size):
+        return _read_image(lambda: _find_blip_image(element), alt_text, size)
+
+    def _read_image(find_image, alt_text, size=None):
         image_path, open_image = find_image()
         content_type = content_types.find_content_type(image_path)
-        image = documents.image(alt_text=alt_text, content_type=content_type, open=open_image)
+        image = documents.image(alt_text=alt_text, content_type=content_type, size=size, open=open_image)
 
         if content_type in ["image/png", "image/gif", "image/jpeg", "image/svg+xml", "image/tiff"]:
             messages = []
@@ -478,14 +489,37 @@ def open_image():
 
         return image_path, open_image
 
-    def read_imagedata(element):
+    def shape(element):
+        """Read a v:shape element.
+
+        When the shape's only child is a v:imagedata element, the image is
+        read with the size declared in the shape's style attribute. In every
+        other case the shape's children are read generically, so any image
+        among them carries no size.
+        """
+        has_single_child = len(element.children) == 1
+        imagedata = element.find_child("v:imagedata") if has_single_child else None
+        if not imagedata:
+            return read_child_elements(element)
+        return read_imagedata(imagedata, _read_shape_size(element))
+
+    def _read_shape_size(element):
+        """Read the image size declared in a shape's ``style`` attribute.
+
+        The attribute is a semicolon-separated list of ``name:value``
+        declarations, e.g. ``width:31.5pt;position:absolute;height:38.25pt``.
+        Values are returned verbatim, including their unit.
+
+        Returns a ``documents.Size``, or ``None`` when the attribute is
+        missing or empty, or does not declare both a width and a height.
+        """
+        style_attribute = element.attributes.get("style")
+        if not style_attribute:
+            return None
+        style = style_attribute.split(";")
+        width = _extract_size_from_style("width", style)
+        height = _extract_size_from_style("height", style)
+        if width is None or height is None:
+            # A half-specified size would reach the HTML writer as a
+            # None-valued attribute, so treat it as "no size" instead.
+            return None
+        return documents.Size(width=width, height=height)
+
+    def _extract_size_from_style(style_name, style):
+        """Return the value declared for ``style_name``, or ``None``.
+
+        ``style`` is the list of raw declarations obtained by splitting the
+        style attribute on ``;``. Whitespace around the name and the value is
+        ignored and the name is matched case-insensitively. The first
+        declaration with a non-empty value wins. ``None`` is returned when
+        there is no such declaration, rather than raising.
+        """
+        for declaration in style:
+            name, separator, value = declaration.partition(":")
+            if not separator or name.strip().lower() != style_name:
+                continue
+            value = value.strip()
+            if value:
+                return value
+        return None
+
+    def read_imagedata(element, style=None):
         relationship_id = element.attributes.get("r:id")
         if relationship_id is None:
             warning = results.warning("A v:imagedata element without a relationship ID was ignored")
             return _empty_result_with_message(warning)
         else:
             title = element.attributes.get("o:title")
-            return _read_image(lambda: _find_embedded_image(relationship_id), title)
+            return _read_image(lambda: _find_embedded_image(relationship_id), title, style)
 
     def note_reference_reader(note_type):
         def note_reference(element):
@@ -522,7 +556,7 @@ def read_sdt(element):
         "v:group": read_child_elements,
         "v:rect": read_child_elements,
         "v:roundrect": read_child_elements,
-        "v:shape": read_child_elements,
+        "v:shape": shape,
         "v:textbox": read_child_elements,
         "w:txbxContent": read_child_elements,
         "w:pict": pict,

diff --git a/mammoth/images.py b/mammoth/images.py
@@ -8,6 +8,9 @@ def convert_image(image):
         attributes = func(image).copy()
         if image.alt_text:
             attributes["alt"] = image.alt_text
+        if image.size:
+            attributes["width"] = image.size.width
+            attributes["height"] = image.size.height
 
         return [html.element("img", attributes)]
 

diff --git a/setup.py b/setup.py
@@ -12,7 +12,7 @@ def read(fname):
     name='mammoth',
     version='1.4.15',
     description='Convert Word documents from docx to simple and clean HTML and Markdown',
-    long_description=read("README"),
+    long_description=read("README.md"),
     author='Michael Williamson',
     author_email='[email protected]',
     url='http://github.com/mwilliamson/python-mammoth',

diff --git a/tests/cli_tests.py b/tests/cli_tests.py
@@ -36,7 +36,7 @@ def html_is_written_to_file_if_output_file_is_set():
 def inline_images_are_included_in_output_if_writing_to_single_file():
     docx_path = test_path("tiny-picture.docx")
     result = _local.run(["mammoth", docx_path])
-    assert_equal(b"""<p><img src="data:image/png;base64,""" + _image_base_64 + b"""" /></p>""", result.output)
+    assert_equal(b"""<p><img height="10" src="data:image/png;base64,""" + _image_base_64 + b"""" width="10" /></p>""", result.output)
 
 
 @istest
@@ -50,7 +50,7 @@ def images_are_written_to_separate_files_if_output_dir_is_set():
         assert_equal(b"", result.stderr_output)
         assert_equal(b"", result.output)
         with open(output_path) as output_file:
-            assert_equal("""<p><img src="1.png" /></p>""", output_file.read())
+            assert_equal("""<p><img height="10" src="1.png" width="10" /></p>""", output_file.read())
 
         with open(image_path, "rb") as image_file:
             assert_equal(_image_base_64, base64.b64encode(image_file.read()))

diff --git a/tests/conversion_tests.py b/tests/conversion_tests.py
@@ -529,6 +529,19 @@ def images_have_alt_tags_if_available():
     image_html = parse_xml(io.StringIO(result.value))
     assert_equal('It\'s a hat', image_html.attributes["alt"])
 
+@istest
+def images_have_width_and_height_tags_if_available():
+    # An image's size is rendered as the width and height attributes of the
+    # generated <img> element, with the values passed through unchanged.
+    image = documents.image(
+        alt_text=None,
+        content_type="image/png",
+        size=documents.Size(width="42", height="51"),
+        open=lambda: io.BytesIO(b"abc")
+    )
+    result = convert_document_element_to_html(image)
+    image_html = parse_xml(io.StringIO(result.value))
+    assert_equal('42', image_html.attributes["width"])
+    assert_equal('51', image_html.attributes["height"])
+
 
 @istest
 def can_define_custom_conversion_for_images():

diff --git a/tests/docx/body_xml_tests.py b/tests/docx/body_xml_tests.py
@@ -4,7 +4,7 @@
 import sys
 
 from precisely import assert_that, is_sequence
-from nose.tools import istest, assert_equal
+from nose.tools import istest, assert_equal, assert_is_none
 from nose_parameterized import parameterized, param
 import funk
 
@@ -961,39 +961,92 @@ class ImageTests(object):
     IMAGE_RELATIONSHIP_ID = "rId5"
 
     def _read_embedded_image(self, element):
+        return self._read_embedded_images(element)[0]
+
+    def _read_embedded_images(self, element):
         relationships = Relationships([
             _image_relationship(self.IMAGE_RELATIONSHIP_ID, "media/hat.png"),
         ])
-
         mocks = funk.Mocks()
         docx_file = mocks.mock()
         funk.allows(docx_file).open("word/media/hat.png").returns(io.BytesIO(self.IMAGE_BYTES))
-
         content_types = mocks.mock()
         funk.allows(content_types).find_content_type("word/media/hat.png").returns("image/png")
-
-        return _read_and_get_document_xml_element(
+        return _read_and_get_document_xml_elements(
             element,
             content_types=content_types,
             relationships=relationships,
             docx_file=docx_file,
         )
 
     @istest
-    def can_read_imagedata_elements_with_rid_attribute(self):
-        imagedata_element = xml_element("v:imagedata", {
-            "r:id": self.IMAGE_RELATIONSHIP_ID,
-            "o:title": "It's a hat"
-        })
+    def can_read_shape_elements_with_rid_and_size_attributes(self):
+        shape_element = xml_element("v:shape", {"style": "width:31.5pt;height:38.25pt"}, [
+            xml_element("v:imagedata", {
+                "r:id": self.IMAGE_RELATIONSHIP_ID,
+                "o:title": "It's a hat"
+            })
+        ])
 
-        image = self._read_embedded_image(imagedata_element)
+        image = self._read_embedded_image(shape_element)
 
         assert_equal(documents.Image, type(image))
         assert_equal("It's a hat", image.alt_text)
         assert_equal("image/png", image.content_type)
+        assert_equal(documents.Size(width="31.5pt", height="38.25pt"), image.size)
         with image.open() as image_file:
             assert_equal(self.IMAGE_BYTES, image_file.read())
 
+    @istest
+    def cannot_resize_shape_with_multiple_nodes(self):
+        # The shape's style is only applied to an image when the image is the
+        # shape's sole child. Here the shape also holds a text box, so both
+        # children are read and the image ends up without a size.
+        shape_element = xml_element("v:shape", {"style": "width:31.5pt;height:38.25pt"}, [
+            xml_element("v:imagedata", {
+                "r:id": self.IMAGE_RELATIONSHIP_ID,
+                "o:title": "It's a hat"
+            }),
+            xml_element("v:textbox", {}, [
+                xml_element("w:txbxContent", {}, [
+                    _paragraph_with_style_id("textbox-content")
+                ])
+            ])
+        ])
+
+        nodes = self._read_embedded_images(shape_element)
+
+        assert_equal(2, len(nodes))
+        image_node = nodes[0]
+        assert_equal(documents.Image, type(image_node))
+        assert_equal("It's a hat", image_node.alt_text)
+        assert_is_none(image_node.size)
+
+    @istest
+    def can_read_shape_elements_with_unused_style_elements(self):
+        # Declarations other than width and height (here position:absolute,
+        # placed between them) are ignored when reading the size.
+        shape_element = xml_element("v:shape", {"style": "width:31.5pt;position:absolute;height:38.25pt"}, [
+            xml_element("v:imagedata", {
+                "r:id": self.IMAGE_RELATIONSHIP_ID,
+                "o:title": "It's a hat"
+            })
+        ])
+
+        image = self._read_embedded_image(shape_element)
+
+        assert_equal(documents.Image, type(image))
+        assert_equal(documents.Size(width="31.5pt", height="38.25pt"), image.size)
+
+    @istest
+    def can_read_shape_elements_with_inch_size_attributes(self):
+        # Sizes are copied verbatim from the style, so a length given in
+        # inches keeps its "in" unit rather than being converted.
+        shape_element = xml_element("v:shape", {"style": "width:0.58in;height:0.708in"}, [
+            xml_element("v:imagedata", {
+                "r:id": self.IMAGE_RELATIONSHIP_ID,
+                "o:title": "It's a hat"
+            })
+        ])
+
+        image = self._read_embedded_image(shape_element)
+
+        assert_equal(documents.Image, type(image))
+        assert_equal(documents.Size(width="0.58in", height="0.708in"), image.size)
+
     @istest
     def when_imagedata_element_has_no_relationship_id_then_it_is_ignored_with_warning(self):
         imagedata_element = xml_element("v:imagedata")
@@ -1009,13 +1062,15 @@ def can_read_inline_pictures(self):
         drawing_element = _create_inline_image(
             blip=_embedded_blip(self.IMAGE_RELATIONSHIP_ID),
             description="It's a hat",
+            extent=(9525, 19000)
         )
 
         image = self._read_embedded_image(drawing_element)
 
         assert_equal(documents.Image, type(image))
         assert_equal("It's a hat", image.alt_text)
         assert_equal("image/png", image.content_type)
+        assert_equal(documents.Size(width="1", height="2"), image.size)
         with image.open() as image_file:
             assert_equal(self.IMAGE_BYTES, image_file.read())
 
@@ -1307,9 +1362,9 @@ def _text_element(value):
     return xml_element("w:t", {}, [xml_text(value)])
 
 
-def _create_inline_image(blip, description=None, title=None):
+def _create_inline_image(blip, description=None, title=None, extent=None):
     return xml_element("w:drawing", {}, [
-        xml_element("wp:inline", {}, _create_image_elements(blip, description=description, title=title))
+        xml_element("wp:inline", {}, _create_image_elements(blip, description=description, title=title, extent=extent))
     ])
 
 
@@ -1319,15 +1374,19 @@ def _create_anchored_image(description, blip):
     ])
 
 
-def _create_image_elements(blip, description=None, title=None):
+def _create_image_elements(blip, description=None, title=None, extent=None):
     properties = {}
     if description is not None:
         properties["descr"] = description
     if title is not None:
         properties["title"] = title
-
+    extent = {
+        "cx": extent[0] if extent else "0",
+        "cy": extent[1] if extent else "0"
+    }
     return [
         xml_element("wp:docPr", properties),
+        xml_element("wp:extent", extent),
         xml_element("a:graphic", {}, [
             xml_element("a:graphicData", {}, [
                 xml_element("pic:pic", {}, [

diff --git a/tests/images_tests.py b/tests/images_tests.py
@@ -17,11 +17,16 @@ def data_uri_encodes_images_in_base64():
     image = mammoth.documents.Image(
         alt_text=None,
         content_type="image/jpeg",
+        size=mammoth.documents.Size(width="800", height="600"),
         open=lambda: io.BytesIO(image_bytes),
     )
 
     result = mammoth.images.data_uri(image)
 
     assert_that(result, contains(
-        has_properties(attributes={"src": "data:image/jpeg;base64,YWJj"}),
+        has_properties(attributes={
+            "src": "data:image/jpeg;base64,YWJj",
+            "width": "800",
+            "height": "600",
+        }),
     ))
diff --git a/tests/mammoth_tests.py b/tests/mammoth_tests.py
@@ -112,23 +112,23 @@ def warning_if_style_mapping_is_not_understood():
 def inline_images_referenced_by_path_relative_to_part_are_included_in_output():
     with open(test_path("tiny-picture.docx"), "rb") as fileobj:
         result = mammoth.convert_to_html(fileobj=fileobj)
-        assert_equal("""<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>""", result.value)
+        assert_equal("""<p><img height="10" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" width="10" /></p>""", result.value)
         assert_equal([], result.messages)
 
 
 @istest
 def inline_images_referenced_by_path_relative_to_base_are_included_in_output():
     with open(test_path("tiny-picture-target-base-relative.docx"), "rb") as fileobj:
         result = mammoth.convert_to_html(fileobj=fileobj)
-        assert_equal("""<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>""", result.value)
+        assert_equal("""<p><img height="10" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" width="10" /></p>""", result.value)
         assert_equal([], result.messages)
 
 
 @istest
 def images_stored_outside_of_document_are_included_in_output():
     with open(test_path("external-picture.docx"), "rb") as fileobj:
         result = mammoth.convert_to_html(fileobj=fileobj)
-        assert_equal("""<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>""", result.value)
+        assert_equal("""<p><img height="10" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" width="10" /></p>""", result.value)
         assert_equal([], result.messages)
 
 
@@ -173,7 +173,7 @@ def convert_image(image):
 
     with open(test_path("tiny-picture.docx"), "rb") as fileobj:
         result = mammoth.convert_to_html(fileobj=fileobj, convert_image=convert_image)
-        assert_equal("""<p><img src="iV,image/png" /></p>""", result.value)
+        assert_equal("""<p><img height="10" src="iV,image/png" width="10" /></p>""", result.value)
         assert_equal([], result.messages)
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,3 +5,4 @@ @@
     /.tox
     /MANIFEST
     /build
+    *.iml