Fix: pict-type images in docx files were not processed (#19305)
Co-authored-by: zqgame <zqgame@zqgame.local>
This commit is contained in:
@@ -238,9 +238,11 @@ class WordExtractor(BaseExtractor):
|
|||||||
paragraph_content = []
|
paragraph_content = []
|
||||||
for run in paragraph.runs:
|
for run in paragraph.runs:
|
||||||
if hasattr(run.element, "tag") and isinstance(run.element.tag, str) and run.element.tag.endswith("r"):
|
if hasattr(run.element, "tag") and isinstance(run.element.tag, str) and run.element.tag.endswith("r"):
|
||||||
|
# Process drawing type images
|
||||||
drawing_elements = run.element.findall(
|
drawing_elements = run.element.findall(
|
||||||
".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing"
|
".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing"
|
||||||
)
|
)
|
||||||
|
has_drawing = False
|
||||||
for drawing in drawing_elements:
|
for drawing in drawing_elements:
|
||||||
blip_elements = drawing.findall(
|
blip_elements = drawing.findall(
|
||||||
".//{http://schemas.openxmlformats.org/drawingml/2006/main}blip"
|
".//{http://schemas.openxmlformats.org/drawingml/2006/main}blip"
|
||||||
@@ -252,6 +254,34 @@ class WordExtractor(BaseExtractor):
|
|||||||
if embed_id:
|
if embed_id:
|
||||||
image_part = doc.part.related_parts.get(embed_id)
|
image_part = doc.part.related_parts.get(embed_id)
|
||||||
if image_part in image_map:
|
if image_part in image_map:
|
||||||
|
has_drawing = True
|
||||||
|
paragraph_content.append(image_map[image_part])
|
||||||
|
# Process pict type images
|
||||||
|
shape_elements = run.element.findall(
|
||||||
|
".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pict"
|
||||||
|
)
|
||||||
|
for shape in shape_elements:
|
||||||
|
# Find image data in VML
|
||||||
|
shape_image = shape.find(
|
||||||
|
".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}binData"
|
||||||
|
)
|
||||||
|
if shape_image is not None and shape_image.text:
|
||||||
|
image_id = shape_image.get(
|
||||||
|
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
|
||||||
|
)
|
||||||
|
if image_id and image_id in doc.part.rels:
|
||||||
|
image_part = doc.part.rels[image_id].target_part
|
||||||
|
if image_part in image_map and not has_drawing:
|
||||||
|
paragraph_content.append(image_map[image_part])
|
||||||
|
# Find imagedata element in VML
|
||||||
|
image_data = shape.find(".//{urn:schemas-microsoft-com:vml}imagedata")
|
||||||
|
if image_data is not None:
|
||||||
|
image_id = image_data.get("id") or image_data.get(
|
||||||
|
"{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
|
||||||
|
)
|
||||||
|
if image_id and image_id in doc.part.rels:
|
||||||
|
image_part = doc.part.rels[image_id].target_part
|
||||||
|
if image_part in image_map and not has_drawing:
|
||||||
paragraph_content.append(image_map[image_part])
|
paragraph_content.append(image_map[image_part])
|
||||||
if run.text.strip():
|
if run.text.strip():
|
||||||
paragraph_content.append(run.text.strip())
|
paragraph_content.append(run.text.strip())
|
||||||
|
Reference in New Issue
Block a user