diff --git a/.gitignore b/.gitignore index 3c7c12ac..baebf8dd 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ /.tox /MANIFEST /build +*.iml diff --git a/mammoth/documents.py b/mammoth/documents.py index 7559d406..17936bfd 100644 --- a/mammoth/documents.py +++ b/mammoth/documents.py @@ -82,6 +82,11 @@ class TableCell(HasChildren): class Break(Element): break_type = cobble.field() +@cobble.data +class Size(object): + width = cobble.field() + height = cobble.field() + line_break = Break("line") page_break = Break("page") column_break = Break("column") @@ -97,6 +102,7 @@ class Image(Element): alt_text = cobble.field() content_type = cobble.field() open = cobble.field() + size = cobble.field(default=None) def document(children, notes=None, comments=None): diff --git a/mammoth/docx/body_xml.py b/mammoth/docx/body_xml.py index 8329bcfc..7b4e05e7 100644 --- a/mammoth/docx/body_xml.py +++ b/mammoth/docx/body_xml.py @@ -11,6 +11,8 @@ from .styles_xml import Styles from .uris import replace_fragment, uri_to_zip_entry_name +EMU_PER_PIXEL = 9525 + if sys.version_info >= (3, ): unichr = chr @@ -423,23 +425,32 @@ def inline(element): alt_text = properties.get("descr") else: alt_text = properties.get("title") + dimensions = element.find_child_or_null("wp:extent").attributes + size = documents.Size( + width=str(_emu_to_pixel(dimensions.get("cx"))), + height=str(_emu_to_pixel(dimensions.get("cy"))) + ) + blips = element.find_children("a:graphic") \ .find_children("a:graphicData") \ .find_children("pic:pic") \ .find_children("pic:blipFill") \ .find_children("a:blip") - return _read_blips(blips, alt_text) + return _read_blips(blips, alt_text, size) - def _read_blips(blips, alt_text): - return _ReadResult.concat(lists.map(lambda blip: _read_blip(blip, alt_text), blips)) + def _emu_to_pixel(emu): + return int(round(float(emu) / EMU_PER_PIXEL)) - def _read_blip(element, alt_text): - return _read_image(lambda: _find_blip_image(element), alt_text) + def _read_blips(blips, alt_text, size): + return _ReadResult.concat(lists.map(lambda blip: _read_blip(blip, alt_text, size), blips)) - def _read_image(find_image, alt_text): + def _read_blip(element, alt_text, size): + return _read_image(lambda: _find_blip_image(element), alt_text, size) + + def _read_image(find_image, alt_text, size=None): image_path, open_image = find_image() content_type = content_types.find_content_type(image_path) - image = documents.image(alt_text=alt_text, content_type=content_type, open=open_image) + image = documents.image(alt_text=alt_text, content_type=content_type, size=size, open=open_image) if content_type in ["image/png", "image/gif", "image/jpeg", "image/svg+xml", "image/tiff"]: messages = [] @@ -478,14 +489,37 @@ def open_image(): return image_path, open_image - def read_imagedata(element): + def shape(element): + if len(element.children) == 1: + imagedata = element.find_child("v:imagedata") + if imagedata: + size = _read_shape_size(element) + return read_imagedata(imagedata, size) + return read_child_elements(element) + + def _read_shape_size(element): + style_attribute = element.attributes.get("style") + if not style_attribute: + return None + style = style_attribute.split(";") + width = _extract_size_from_style("width", style) + height = _extract_size_from_style("height", style) + size = documents.Size(width=width, height=height) + return size + + def _extract_size_from_style(style_name, style): + with_column = "{}:".format(style_name) + raw_size = next(iter(filter(lambda s: s.startswith(with_column), style))) + return raw_size.replace(with_column, "") + + def read_imagedata(element, style=None): relationship_id = element.attributes.get("r:id") if relationship_id is None: warning = results.warning("A v:imagedata element without a relationship ID was ignored") return _empty_result_with_message(warning) else: title = element.attributes.get("o:title") - return _read_image(lambda: _find_embedded_image(relationship_id), title) + return _read_image(lambda: _find_embedded_image(relationship_id), title, style) def note_reference_reader(note_type): def note_reference(element): @@ -522,7 +556,7 @@ def read_sdt(element): "v:group": read_child_elements, "v:rect": read_child_elements, "v:roundrect": read_child_elements, - "v:shape": read_child_elements, + "v:shape": shape, "v:textbox": read_child_elements, "w:txbxContent": read_child_elements, "w:pict": pict, diff --git a/mammoth/images.py b/mammoth/images.py index 9dba353f..89770803 100644 --- a/mammoth/images.py +++ b/mammoth/images.py @@ -8,6 +8,9 @@ def convert_image(image): attributes = func(image).copy() if image.alt_text: attributes["alt"] = image.alt_text + if image.size: + attributes["width"] = image.size.width + attributes["height"] = image.size.height return [html.element("img", attributes)] diff --git a/setup.py b/setup.py index 7da0d14d..751686f2 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ def read(fname): name='mammoth', version='1.4.15', description='Convert Word documents from docx to simple and clean HTML and Markdown', - long_description=read("README"), + long_description=read("README.md"), author='Michael Williamson', author_email='mike@zwobble.org', url='http://github.com/mwilliamson/python-mammoth', diff --git a/tests/cli_tests.py b/tests/cli_tests.py index 4728c6b0..82c2ef01 100644 --- a/tests/cli_tests.py +++ b/tests/cli_tests.py @@ -36,7 +36,7 @@ def html_is_written_to_file_if_output_file_is_set(): def inline_images_are_included_in_output_if_writing_to_single_file(): docx_path = test_path("tiny-picture.docx") result = _local.run(["mammoth", docx_path]) - assert_equal(b"""

""", result.output) + assert_equal(b"""

""", result.output) @istest @@ -50,7 +50,7 @@ def images_are_written_to_separate_files_if_output_dir_is_set(): assert_equal(b"", result.stderr_output) assert_equal(b"", result.output) with open(output_path) as output_file: - assert_equal("""

""", output_file.read()) + assert_equal("""

""", output_file.read()) with open(image_path, "rb") as image_file: assert_equal(_image_base_64, base64.b64encode(image_file.read())) diff --git a/tests/conversion_tests.py b/tests/conversion_tests.py index c4e667f3..49752380 100644 --- a/tests/conversion_tests.py +++ b/tests/conversion_tests.py @@ -529,6 +529,19 @@ def images_have_alt_tags_if_available(): image_html = parse_xml(io.StringIO(result.value)) assert_equal('It\'s a hat', image_html.attributes["alt"]) +@istest +def images_have_width_and_height_tags_if_available(): + image = documents.image( + alt_text=None, + content_type="image/png", + size=documents.Size(width="42", height="51"), + open=lambda: io.BytesIO(b"abc") + ) + result = convert_document_element_to_html(image) + image_html = parse_xml(io.StringIO(result.value)) + assert_equal('42', image_html.attributes["width"]) + assert_equal('51', image_html.attributes["height"]) + @istest def can_define_custom_conversion_for_images(): diff --git a/tests/docx/body_xml_tests.py b/tests/docx/body_xml_tests.py index 94061fda..37e0d425 100644 --- a/tests/docx/body_xml_tests.py +++ b/tests/docx/body_xml_tests.py @@ -4,7 +4,7 @@ import sys from precisely import assert_that, is_sequence -from nose.tools import istest, assert_equal +from nose.tools import istest, assert_equal, assert_is_none from nose_parameterized import parameterized, param import funk @@ -961,18 +961,18 @@ class ImageTests(object): IMAGE_RELATIONSHIP_ID = "rId5" def _read_embedded_image(self, element): + return self._read_embedded_images(element)[0] + + def _read_embedded_images(self, element): relationships = Relationships([ _image_relationship(self.IMAGE_RELATIONSHIP_ID, "media/hat.png"), ]) - mocks = funk.Mocks() docx_file = mocks.mock() funk.allows(docx_file).open("word/media/hat.png").returns(io.BytesIO(self.IMAGE_BYTES)) - content_types = mocks.mock() funk.allows(content_types).find_content_type("word/media/hat.png").returns("image/png") - - return _read_and_get_document_xml_element( + return _read_and_get_document_xml_elements( element, content_types=content_types, relationships=relationships, @@ -980,20 +980,73 @@ def _read_embedded_image(self, element): ) @istest - def can_read_imagedata_elements_with_rid_attribute(self): - imagedata_element = xml_element("v:imagedata", { - "r:id": self.IMAGE_RELATIONSHIP_ID, - "o:title": "It's a hat" - }) + def can_read_shape_elements_with_rid_and_size_attributes(self): + shape_element = xml_element("v:shape", {"style": "width:31.5pt;height:38.25pt"}, [ + xml_element("v:imagedata", { + "r:id": self.IMAGE_RELATIONSHIP_ID, + "o:title": "It's a hat" + }) + ]) - image = self._read_embedded_image(imagedata_element) + image = self._read_embedded_image(shape_element) assert_equal(documents.Image, type(image)) assert_equal("It's a hat", image.alt_text) assert_equal("image/png", image.content_type) + assert_equal(documents.Size(width="31.5pt", height="38.25pt"), image.size) with image.open() as image_file: assert_equal(self.IMAGE_BYTES, image_file.read()) + @istest + def cannot_resize_shape_with_multiple_nodes(self): + shape_element = xml_element("v:shape", {"style": "width:31.5pt;height:38.25pt"}, [ + xml_element("v:imagedata", { + "r:id": self.IMAGE_RELATIONSHIP_ID, + "o:title": "It's a hat" + }), + xml_element("v:textbox", {}, [ + xml_element("w:txbxContent", {}, [ + _paragraph_with_style_id("textbox-content") + ]) + ]) + ]) + + nodes = self._read_embedded_images(shape_element) + + assert_equal(2, len(nodes)) + image_node = nodes[0] + assert_equal(documents.Image, type(image_node)) + assert_equal("It's a hat", image_node.alt_text) + assert_is_none(image_node.size) + + @istest + def can_read_shape_elements_with_unused_style_elements(self): + shape_element = xml_element("v:shape", {"style": "width:31.5pt;position:absolute;height:38.25pt"}, [ + xml_element("v:imagedata", { + "r:id": self.IMAGE_RELATIONSHIP_ID, + "o:title": "It's a hat" + }) + ]) + + image = self._read_embedded_image(shape_element) + + assert_equal(documents.Image, type(image)) + assert_equal(documents.Size(width="31.5pt", height="38.25pt"), image.size) + + @istest + def can_read_shape_elements_with_inch_size_attributes(self): + shape_element = xml_element("v:shape", {"style": "width:0.58in;height:0.708in"}, [ + xml_element("v:imagedata", { + "r:id": self.IMAGE_RELATIONSHIP_ID, + "o:title": "It's a hat" + }) + ]) + + image = self._read_embedded_image(shape_element) + + assert_equal(documents.Image, type(image)) + assert_equal(documents.Size(width="0.58in", height="0.708in"), image.size) + @istest def when_imagedata_element_has_no_relationship_id_then_it_is_ignored_with_warning(self): imagedata_element = xml_element("v:imagedata") @@ -1009,6 +1062,7 @@ def can_read_inline_pictures(self): drawing_element = _create_inline_image( blip=_embedded_blip(self.IMAGE_RELATIONSHIP_ID), description="It's a hat", + extent=(9525, 19000) ) image = self._read_embedded_image(drawing_element) @@ -1016,6 +1070,7 @@ def can_read_inline_pictures(self): assert_equal(documents.Image, type(image)) assert_equal("It's a hat", image.alt_text) assert_equal("image/png", image.content_type) + assert_equal(documents.Size(width="1", height="2"), image.size) with image.open() as image_file: assert_equal(self.IMAGE_BYTES, image_file.read()) @@ -1307,9 +1362,9 @@ def _text_element(value): return xml_element("w:t", {}, [xml_text(value)]) -def _create_inline_image(blip, description=None, title=None): +def _create_inline_image(blip, description=None, title=None, extent=None): return xml_element("w:drawing", {}, [ - xml_element("wp:inline", {}, _create_image_elements(blip, description=description, title=title)) + xml_element("wp:inline", {}, _create_image_elements(blip, description=description, title=title, extent=extent)) ]) @@ -1319,15 +1374,19 @@ def _create_anchored_image(description, blip): ]) -def _create_image_elements(blip, description=None, title=None): +def _create_image_elements(blip, description=None, title=None, extent=None): properties = {} if description is not None: properties["descr"] = description if title is not None: properties["title"] = title - + extent = { + "cx": extent[0] if extent else "0", + "cy": extent[1] if extent else "0" + } return [ xml_element("wp:docPr", properties), + xml_element("wp:extent", extent), xml_element("a:graphic", {}, [ xml_element("a:graphicData", {}, [ xml_element("pic:pic", {}, [ diff --git a/tests/images_tests.py b/tests/images_tests.py index cf2e2844..9962d252 100644 --- a/tests/images_tests.py +++ b/tests/images_tests.py @@ -17,11 +17,16 @@ def data_uri_encodes_images_in_base64(): image = mammoth.documents.Image( alt_text=None, content_type="image/jpeg", + size=mammoth.documents.Size(width="800", height="600"), open=lambda: io.BytesIO(image_bytes), ) result = mammoth.images.data_uri(image) assert_that(result, contains( - has_properties(attributes={"src": "data:image/jpeg;base64,YWJj"}), + has_properties(attributes={ + "src": "data:image/jpeg;base64,YWJj", + "width": "800", + "height": "600", + }), )) diff --git a/tests/mammoth_tests.py b/tests/mammoth_tests.py index 1dcbaa9c..175c6cc8 100644 --- a/tests/mammoth_tests.py +++ b/tests/mammoth_tests.py @@ -112,7 +112,7 @@ def warning_if_style_mapping_is_not_understood(): def inline_images_referenced_by_path_relative_to_part_are_included_in_output(): with open(test_path("tiny-picture.docx"), "rb") as fileobj: result = mammoth.convert_to_html(fileobj=fileobj) - assert_equal("""

""", result.value) + assert_equal("""

""", result.value) assert_equal([], result.messages) @@ -120,7 +120,7 @@ def inline_images_referenced_by_path_relative_to_part_are_included_in_output(): def inline_images_referenced_by_path_relative_to_base_are_included_in_output(): with open(test_path("tiny-picture-target-base-relative.docx"), "rb") as fileobj: result = mammoth.convert_to_html(fileobj=fileobj) - assert_equal("""

""", result.value) + assert_equal("""

""", result.value) assert_equal([], result.messages) @@ -128,7 +128,7 @@ def inline_images_referenced_by_path_relative_to_base_are_included_in_output(): def images_stored_outside_of_document_are_included_in_output(): with open(test_path("external-picture.docx"), "rb") as fileobj: result = mammoth.convert_to_html(fileobj=fileobj) - assert_equal("""

""", result.value) + assert_equal("""

""", result.value) assert_equal([], result.messages) @@ -173,7 +173,7 @@ def convert_image(image): with open(test_path("tiny-picture.docx"), "rb") as fileobj: result = mammoth.convert_to_html(fileobj=fileobj, convert_image=convert_image) - assert_equal("""

""", result.value) + assert_equal("""

""", result.value) assert_equal([], result.messages)