Skip to content

Commit 8723f4b

Browse files
committed
merge pr mwilliamson#93 on fresh master
1 parent d037ef5 commit 8723f4b

File tree

9 files changed

+152
-31
lines changed

9 files changed

+152
-31
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
/.tox
66
/MANIFEST
77
/build
8+
*.iml

mammoth/documents.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,11 @@ class TableCell(HasChildren):
8282
class Break(Element):
8383
break_type = cobble.field()
8484

85+
# Dimensions of an image; stored on Image.size, which defaults to None when
# no size was read. The readers in this change store both values as strings:
# whole pixel counts for inline drawings (e.g. "10") and raw CSS lengths for
# VML shapes (e.g. "31.5pt"). convert_image copies them verbatim into the
# <img> width/height attributes.
@cobble.data
86+
class Size(object):
87+
# Horizontal extent of the image.
width = cobble.field()
88+
# Vertical extent of the image.
height = cobble.field()
89+
8590
line_break = Break("line")
8691
page_break = Break("page")
8792
column_break = Break("column")
@@ -97,6 +102,7 @@ class Image(Element):
97102
alt_text = cobble.field()
98103
content_type = cobble.field()
99104
open = cobble.field()
105+
size = cobble.field(default=None)
100106

101107

102108
def document(children, notes=None, comments=None):

mammoth/docx/body_xml.py

Lines changed: 44 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
from .styles_xml import Styles
1212
from .uris import replace_fragment, uri_to_zip_entry_name
1313

14+
EMU_PER_PIXEL = 9525
15+
1416
if sys.version_info >= (3, ):
1517
unichr = chr
1618

@@ -446,29 +448,38 @@ def inline(element):
446448
alt_text = properties.get("descr")
447449
else:
448450
alt_text = properties.get("title")
451+
dimensions = element.find_child_or_null("wp:extent").attributes
452+
size = documents.Size(
453+
width=str(_emu_to_pixel(dimensions.get("cx"))),
454+
height=str(_emu_to_pixel(dimensions.get("cy")))
455+
)
456+
449457
blips = element.find_children("a:graphic") \
450458
.find_children("a:graphicData") \
451459
.find_children("pic:pic") \
452460
.find_children("pic:blipFill") \
453461
.find_children("a:blip")
454-
return _read_blips(blips, alt_text)
462+
return _read_blips(blips, alt_text, size)
455463

456-
def _read_blips(blips, alt_text):
457-
return _ReadResult.concat(lists.map(lambda blip: _read_blip(blip, alt_text), blips))
464+
def _emu_to_pixel(emu):
    """Convert a length in EMUs to a whole number of pixels.

    ``emu`` may be anything ``float()`` accepts; the caller passes the raw
    ``cx``/``cy`` attribute text. The value is divided by ``EMU_PER_PIXEL``
    and rounded with ``round()`` before being returned as an ``int``.
    """
    pixels = float(emu) / EMU_PER_PIXEL
    return int(round(pixels))
458466

459-
def _read_blip(element, alt_text):
467+
def _read_blips(blips, alt_text, size):
    """Read each blip with the shared alt text and size, and concatenate the results."""
    def read_one(blip):
        return _read_blip(blip, alt_text, size)

    return _ReadResult.concat(lists.map(read_one, blips))
469+
470+
def _read_blip(element, alt_text, size):
460471
blip_image = _find_blip_image(element)
461472

462473
if blip_image is None:
463474
warning = results.warning("Could not find image file for a:blip element")
464475
return _empty_result_with_message(warning)
465476
else:
466-
return _read_image(blip_image, alt_text)
477+
return _read_image(blip_image, alt_text, size)
467478

468-
def _read_image(image_file, alt_text):
479+
def _read_image(image_file, alt_text, size=None):
469480
image_path, open_image = image_file
470481
content_type = content_types.find_content_type(image_path)
471-
image = documents.image(alt_text=alt_text, content_type=content_type, open=open_image)
482+
image = documents.image(alt_text=alt_text, content_type=content_type, size=size, open=open_image)
472483

473484
if content_type in ["image/png", "image/gif", "image/jpeg", "image/svg+xml", "image/tiff"]:
474485
messages = []
@@ -509,14 +520,37 @@ def open_image():
509520

510521
return image_path, open_image
511522

512-
def read_imagedata(element):
523+
def shape(element):
    """Read a ``v:shape`` element.

    When the shape has exactly one child and that child is a ``v:imagedata``,
    the image is read together with the size taken from the shape's ``style``
    attribute. In every other case the shape's children are read generically
    and no size is attached.
    """
    if len(element.children) != 1:
        return read_child_elements(element)

    imagedata = element.find_child("v:imagedata")
    if not imagedata:
        # The only child is something other than an image.
        return read_child_elements(element)

    shape_size = _read_shape_size(element)
    return read_imagedata(imagedata, shape_size)
530+
531+
def _read_shape_size(element):
    """Read the size of a ``v:shape`` from its ``style`` attribute.

    Returns a ``documents.Size`` whose width and height are the raw CSS
    lengths found in the style (for example ``"31.5pt"`` or ``"0.58in"``).
    Returns ``None`` when the element has no style, or when the style does
    not declare both ``width`` and ``height``, so that the image is emitted
    without size attributes instead of aborting the conversion.
    """
    style_attribute = element.attributes.get("style")
    if not style_attribute:
        return None
    declarations = style_attribute.split(";")
    width = _extract_size_from_style("width", declarations)
    height = _extract_size_from_style("height", declarations)
    if width is None or height is None:
        # A partial size would put a None into the <img> attributes.
        return None
    return documents.Size(width=width, height=height)


def _extract_size_from_style(style_name, style):
    """Return the value declared for ``style_name``, or ``None`` if absent.

    ``style`` is a list of ``name:value`` declarations, i.e. a style
    attribute already split on ``;``. Whitespace around the name and the
    value is ignored and the name is matched case-insensitively, so
    ``" Height: 38.25pt"`` matches ``"height"``. A declaration with an empty
    value is treated as absent.
    """
    wanted = style_name.lower()
    for declaration in style:
        name, separator, value = declaration.partition(":")
        if not separator or name.strip().lower() != wanted:
            continue
        value = value.strip()
        if value:
            return value
    return None
545+
546+
def read_imagedata(element, style=None):
513547
relationship_id = element.attributes.get("r:id")
514548
if relationship_id is None:
515549
warning = results.warning("A v:imagedata element without a relationship ID was ignored")
516550
return _empty_result_with_message(warning)
517551
else:
518552
title = element.attributes.get("o:title")
519-
return _read_image(_find_embedded_image(relationship_id), title)
553+
return _read_image(_find_embedded_image(relationship_id), title, style)
520554

521555
def note_reference_reader(note_type):
522556
def note_reference(element):
@@ -553,7 +587,7 @@ def read_sdt(element):
553587
"v:group": read_child_elements,
554588
"v:rect": read_child_elements,
555589
"v:roundrect": read_child_elements,
556-
"v:shape": read_child_elements,
590+
"v:shape": shape,
557591
"v:textbox": read_child_elements,
558592
"w:txbxContent": read_child_elements,
559593
"w:pict": pict,

mammoth/images.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ def convert_image(image):
88
attributes = {}
99
if image.alt_text:
1010
attributes["alt"] = image.alt_text
11+
if image.size:
12+
attributes["width"] = image.size.width
13+
attributes["height"] = image.size.height
1114
attributes.update(func(image))
1215

1316
return [html.element("img", attributes)]

tests/cli_tests.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def test_html_is_written_to_file_if_output_file_is_set():
3434
def test_inline_images_are_included_in_output_if_writing_to_single_file():
3535
docx_path = generate_test_path("tiny-picture.docx")
3636
result = _local.run(["mammoth", docx_path])
37-
assert_equal(b"""<p><img src="data:image/png;base64,""" + _image_base_64 + b"""" /></p>""", result.output)
37+
assert_equal(b"""<p><img height="10" src="data:image/png;base64,""" + _image_base_64 + b"""" width="10" /></p>""", result.output)
3838

3939

4040
def test_images_are_written_to_separate_files_if_output_dir_is_set():
@@ -47,7 +47,7 @@ def test_images_are_written_to_separate_files_if_output_dir_is_set():
4747
assert_equal(b"", result.stderr_output)
4848
assert_equal(b"", result.output)
4949
with open(output_path) as output_file:
50-
assert_equal("""<p><img src="1.png" /></p>""", output_file.read())
50+
assert_equal("""<p><img height="10" src="1.png" width="10" /></p>""", output_file.read())
5151

5252
with open(image_path, "rb") as image_file:
5353
assert_equal(_image_base_64, base64.b64encode(image_file.read()))

tests/conversion_tests.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,19 @@ def test_images_have_alt_tags_if_available():
481481
assert_equal('It\'s a hat', image_html.attributes["alt"])
482482

483483

484+
def test_images_have_width_and_height_tags_if_available():
    """An image's size is emitted as the ``width`` and ``height`` attributes of the ``img``."""
    # The ``test_`` prefix matches the neighbouring tests in this module;
    # without it the test runner does not collect this function.
    image = documents.image(
        alt_text=None,
        content_type="image/png",
        size=documents.Size(width="42", height="51"),
        open=lambda: io.BytesIO(b"abc")
    )
    result = convert_document_element_to_html(image)
    image_html = parse_xml(io.StringIO(result.value))
    assert_equal('42', image_html.attributes["width"])
    assert_equal('51', image_html.attributes["height"])
495+
496+
484497
def test_can_define_custom_conversion_for_images():
485498
def convert_image(image):
486499
with image.open() as image_file:

tests/docx/body_xml_tests.py

Lines changed: 73 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -954,38 +954,91 @@ class ImageTests(object):
954954
IMAGE_RELATIONSHIP_ID = "rId5"
955955

956956
def _read_embedded_image(self, element):
957+
return self._read_embedded_images(element)[0]
958+
959+
def _read_embedded_images(self, element):
957960
relationships = Relationships([
958961
_image_relationship(self.IMAGE_RELATIONSHIP_ID, "media/hat.png"),
959962
])
960-
961963
mocks = funk.Mocks()
962964
docx_file = mocks.mock()
963965
funk.allows(docx_file).open("word/media/hat.png").returns(io.BytesIO(self.IMAGE_BYTES))
964-
965966
content_types = mocks.mock()
966967
funk.allows(content_types).find_content_type("word/media/hat.png").returns("image/png")
967-
968-
return _read_and_get_document_xml_element(
968+
return _read_and_get_document_xml_elements(
969969
element,
970970
content_types=content_types,
971971
relationships=relationships,
972972
docx_file=docx_file,
973973
)
974974

975-
def test_can_read_imagedata_elements_with_rid_attribute(self):
976-
imagedata_element = xml_element("v:imagedata", {
977-
"r:id": self.IMAGE_RELATIONSHIP_ID,
978-
"o:title": "It's a hat"
979-
})
975+
def can_read_shape_elements_with_rid_and_size_attributes(self):
976+
shape_element = xml_element("v:shape", {"style": "width:31.5pt;height:38.25pt"}, [
977+
xml_element("v:imagedata", {
978+
"r:id": self.IMAGE_RELATIONSHIP_ID,
979+
"o:title": "It's a hat"
980+
})
981+
])
980982

981-
image = self._read_embedded_image(imagedata_element)
983+
image = self._read_embedded_image(shape_element)
982984

983985
assert_equal(documents.Image, type(image))
984986
assert_equal("It's a hat", image.alt_text)
985987
assert_equal("image/png", image.content_type)
988+
assert_equal(documents.Size(width="31.5pt", height="38.25pt"), image.size)
986989
with image.open() as image_file:
987990
assert_equal(self.IMAGE_BYTES, image_file.read())
988991

992+
993+
def test_cannot_resize_shape_with_multiple_nodes(self):
    """A ``v:shape`` with more than one child yields an image without a size."""
    # The ``test_`` prefix matches the other methods of this class; without
    # it the test runner does not collect this method.
    shape_element = xml_element("v:shape", {"style": "width:31.5pt;height:38.25pt"}, [
        xml_element("v:imagedata", {
            "r:id": self.IMAGE_RELATIONSHIP_ID,
            "o:title": "It's a hat"
        }),
        xml_element("v:textbox", {}, [
            xml_element("w:txbxContent", {}, [
                _paragraph_with_style_id("textbox-content")
            ])
        ])
    ])

    nodes = self._read_embedded_images(shape_element)

    assert_equal(2, len(nodes))
    image_node = nodes[0]
    assert_equal(documents.Image, type(image_node))
    assert_equal("It's a hat", image_node.alt_text)
    # The shape's style is only applied when the image is its sole child.
    assert_equal(None, image_node.size)
1013+
1014+
1015+
def test_can_read_shape_elements_with_unused_style_elements(self):
    """Style declarations other than ``width`` and ``height`` are ignored when reading the size."""
    # The ``test_`` prefix matches the other methods of this class; without
    # it the test runner does not collect this method.
    shape_element = xml_element("v:shape", {"style": "width:31.5pt;position:absolute;height:38.25pt"}, [
        xml_element("v:imagedata", {
            "r:id": self.IMAGE_RELATIONSHIP_ID,
            "o:title": "It's a hat"
        })
    ])

    image = self._read_embedded_image(shape_element)

    assert_equal(documents.Image, type(image))
    assert_equal(documents.Size(width="31.5pt", height="38.25pt"), image.size)
1027+
1028+
1029+
def test_can_read_shape_elements_with_inch_size_attributes(self):
    """Sizes given in inches are kept verbatim, unit included."""
    # The ``test_`` prefix matches the other methods of this class; without
    # it the test runner does not collect this method.
    shape_element = xml_element("v:shape", {"style": "width:0.58in;height:0.708in"}, [
        xml_element("v:imagedata", {
            "r:id": self.IMAGE_RELATIONSHIP_ID,
            "o:title": "It's a hat"
        })
    ])

    image = self._read_embedded_image(shape_element)

    assert_equal(documents.Image, type(image))
    assert_equal(documents.Size(width="0.58in", height="0.708in"), image.size)
1041+
9891042
def test_when_imagedata_element_has_no_relationship_id_then_it_is_ignored_with_warning(self):
9901043
imagedata_element = xml_element("v:imagedata")
9911044

@@ -999,13 +1052,15 @@ def test_can_read_inline_pictures(self):
9991052
drawing_element = _create_inline_image(
10001053
blip=_embedded_blip(self.IMAGE_RELATIONSHIP_ID),
10011054
description="It's a hat",
1055+
extent=(9525, 19000)
10021056
)
10031057

10041058
image = self._read_embedded_image(drawing_element)
10051059

10061060
assert_equal(documents.Image, type(image))
10071061
assert_equal("It's a hat", image.alt_text)
10081062
assert_equal("image/png", image.content_type)
1063+
assert_equal(documents.Size(width="1", height="2"), image.size)
10091064
with image.open() as image_file:
10101065
assert_equal(self.IMAGE_BYTES, image_file.read())
10111066

@@ -1294,9 +1349,9 @@ def _text_element(value):
12941349
return xml_element("w:t", {}, [xml_text(value)])
12951350

12961351

1297-
def _create_inline_image(blip, description=None, title=None):
1352+
def _create_inline_image(blip, description=None, title=None, extent=None):
12981353
return xml_element("w:drawing", {}, [
1299-
xml_element("wp:inline", {}, _create_image_elements(blip, description=description, title=title))
1354+
xml_element("wp:inline", {}, _create_image_elements(blip, description=description, title=title, extent=extent))
13001355
])
13011356

13021357

@@ -1306,15 +1361,19 @@ def _create_anchored_image(description, blip):
13061361
])
13071362

13081363

1309-
def _create_image_elements(blip, description=None, title=None):
1364+
def _create_image_elements(blip, description=None, title=None, extent=None):
13101365
properties = {}
13111366
if description is not None:
13121367
properties["descr"] = description
13131368
if title is not None:
13141369
properties["title"] = title
1315-
1370+
extent = {
1371+
"cx": extent[0] if extent else "0",
1372+
"cy": extent[1] if extent else "0"
1373+
}
13161374
return [
13171375
xml_element("wp:docPr", properties),
1376+
xml_element("wp:extent", extent),
13181377
xml_element("a:graphic", {}, [
13191378
xml_element("a:graphicData", {}, [
13201379
xml_element("pic:pic", {}, [

tests/images_tests.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,18 @@ def test_data_uri_encodes_images_in_base64():
1414
image = mammoth.documents.Image(
1515
alt_text=None,
1616
content_type="image/jpeg",
17+
size=mammoth.documents.Size(width="800", height="600"),
1718
open=lambda: io.BytesIO(image_bytes),
1819
)
1920

2021
result = mammoth.images.data_uri(image)
2122

2223
assert_that(result, is_sequence(
23-
has_attrs(attributes={"src": "data:image/jpeg;base64,YWJj"}),
24+
has_attrs(attributes={
25+
"src": "data:image/jpeg;base64,YWJj",
26+
"width": "800",
27+
"height": "600",
28+
}),
2429
))
2530

2631

tests/mammoth_tests.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,21 +99,21 @@ def test_warning_if_style_mapping_is_not_understood():
9999
def test_inline_images_referenced_by_path_relative_to_part_are_included_in_output():
100100
with open(generate_test_path("tiny-picture.docx"), "rb") as fileobj:
101101
result = mammoth.convert_to_html(fileobj=fileobj)
102-
assert_equal("""<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>""", result.value)
102+
assert_equal("""<p><img height="10" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" width="10" /></p>""", result.value)
103103
assert_equal([], result.messages)
104104

105105

106106
def test_inline_images_referenced_by_path_relative_to_base_are_included_in_output():
107107
with open(generate_test_path("tiny-picture-target-base-relative.docx"), "rb") as fileobj:
108108
result = mammoth.convert_to_html(fileobj=fileobj)
109-
assert_equal("""<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>""", result.value)
109+
assert_equal("""<p><img height="10" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" width="10" /></p>""", result.value)
110110
assert_equal([], result.messages)
111111

112112

113113
def test_images_stored_outside_of_document_are_included_in_output():
114114
with open(generate_test_path("external-picture.docx"), "rb") as fileobj:
115115
result = mammoth.convert_to_html(fileobj=fileobj)
116-
assert_equal("""<p><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" /></p>""", result.value)
116+
assert_equal("""<p><img height="10" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAAAAXNSR0IArs4c6QAAAAlwSFlzAAAOvgAADr4B6kKxwAAAABNJREFUKFNj/M+ADzDhlWUYqdIAQSwBE8U+X40AAAAASUVORK5CYII=" width="10" /></p>""", result.value)
117117
assert_equal([], result.messages)
118118

119119

@@ -155,7 +155,7 @@ def convert_image(image):
155155

156156
with open(generate_test_path("tiny-picture.docx"), "rb") as fileobj:
157157
result = mammoth.convert_to_html(fileobj=fileobj, convert_image=convert_image)
158-
assert_equal("""<p><img src="iV,image/png" /></p>""", result.value)
158+
assert_equal("""<p><img height="10" src="iV,image/png" width="10" /></p>""", result.value)
159159
assert_equal([], result.messages)
160160

161161

0 commit comments

Comments
 (0)