# ===== File: Official_Acoount/LazyRAG_MinerU/rag_mineru.py (new file, mode 100644, blob 9d9e54e) =====
# -*- coding: utf-8 -*-
# flake8: noqa: F821
"""Interactive RAG demo: PDFs are parsed with magic-pdf (MinerU) and queried through LazyLLM."""

import lazyllm
from lazyllm import LOG
from lazyllm import pipeline, parallel, bind, OnlineEmbeddingModule, SentenceSplitter, Document, Retriever, Reranker
from utils.magic_pdf_reader import MagicPDFReader
from utils.magic_pdf_transform import MagicPDFTransform

# NOTE: the API key of the LLM service must be exported before running, e.g.
#   export LAZYLLM_DEEPSEEK_API_KEY=""   or   export LAZYLLM_QWEN_API_KEY=""
prompt = 'You will play the role of an AI question-answering assistant and complete a conversation task in which you need to provide your answer based on the given context and question. Please note that if the given context cannot answer the question, do not use your prior knowledge but tell the user that the given context cannot answer the question.'

# dataset_path is intentionally left empty in this sample; presumably it must be
# pointed at the directory holding the PDF files before running -- TODO confirm.
documents = Document(dataset_path="", embed=OnlineEmbeddingModule(), manager=False)

# Parse every file matching **/*.pdf with magic-pdf (MagicPDFReader).
documents.add_reader("**/*.pdf", MagicPDFReader)
# Build the "magic-pdf" node group with the custom node transform.
documents.create_node_group(name="magic-pdf", transform=MagicPDFTransform)

with pipeline() as ppl:
    # Two retrievers over the same node group (cosine and bm25_chinese, top 3 each);
    # parallel().sum presumably concatenates their results -- confirm against the LazyLLM docs.
    with parallel().sum as ppl.prl:
        prl.retriever1 = Retriever(documents, group_name="magic-pdf", similarity="cosine", topk=3)
        prl.retriever2 = Retriever(documents, group_name="magic-pdf", similarity="bm25_chinese", topk=3)
    # Keep the single best candidate; the original query is bound in as `query`.
    ppl.reranker = Reranker("ModuleReranker", model=OnlineEmbeddingModule(type="rerank"), topk=1, output_format='content', join=True) | bind(query=ppl.input)
    # Package the reranked context and the query into the keys the prompter expects.
    ppl.formatter = (lambda nodes, query: dict(context_str=nodes, query=query)) | bind(query=ppl.input)
    ppl.llm = lazyllm.OnlineChatModule(stream=False).prompt(lazyllm.ChatPrompter(prompt, extra_keys=["context_str"]))


if __name__ == "__main__":
    while True:
        print("✨ Welcome to your smart assistant ✨")

        try:
            query = input("\n🚀 Enter your query (type 'exit' to quit): \n> ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the user pressed Ctrl-C: leave exactly like "exit"
            # instead of dying with a traceback.
            query = "exit"

        if query.lower() == "exit":
            print("\n👋 Exiting... Thank you for the using!")
            break

        if not query:
            # Nothing was typed; do not spend a retrieval + LLM round trip on it.
            continue

        print(f"\n✅ Received your query: {query}\n")

        answer = ppl(query)

        print("\n" + "=" * 50)
        print("🚀 ANSWER 🚀")
        print("=" * 50 + "\n")
        print(answer)
        print("\n" + "=" * 50 + "\n")


# ===== File: Official_Acoount/LazyRAG_MinerU/rag_mineru_output.py (new file, mode 100644, blob 53b660e) =====
# -*- coding: utf-8 -*-
# flake8: noqa: F821

import lazyllm
from lazyllm import LOG
from lazyllm import pipeline, parallel, bind, OnlineEmbeddingModule, SentenceSplitter, Document, Retriever, Reranker
from utils.magic_pdf_reader import MagicPDFReader
from utils.magic_pdf_transform import MagicPDFTransform
from utils.utils import draw_pdf_bbox

# NOTE: the API key of the LLM service must be exported before running, e.g.
#   export LAZYLLM_DEEPSEEK_API_KEY=""   or   export LAZYLLM_QWEN_API_KEY=""
prompt = 'You will play the role of an AI question-answering assistant and complete a conversation task in which you need to provide your answer based on the given context and question. Please note that if the given context cannot answer the question, do not use your prior knowledge but tell the user that the given context cannot answer the question.'
# (continuation of Official_Acoount/LazyRAG_MinerU/rag_mineru_output.py)

# dataset_path is intentionally left empty in this sample; presumably it must be
# pointed at the directory holding the PDF files before running -- TODO confirm.
documents = Document(dataset_path="", embed=OnlineEmbeddingModule(), manager=False)

# Parse every file matching **/*.pdf with magic-pdf (MagicPDFReader).
documents.add_reader("**/*.pdf", MagicPDFReader)
# Build the "magic-pdf" node group with the custom node transform.
documents.create_node_group(name="magic-pdf", transform=MagicPDFTransform)


with pipeline() as ppl:
    # Two retrievers over the same node group (cosine and bm25_chinese, top 3 each).
    with parallel().sum as ppl.prl:
        prl.retriever1 = Retriever(documents, group_name="magic-pdf", similarity="cosine", topk=3)
        prl.retriever2 = Retriever(documents, group_name="magic-pdf", similarity="bm25_chinese", topk=3)
    ppl.reranker = Reranker("ModuleReranker", model=OnlineEmbeddingModule(type="rerank"), topk=1) | bind(query=ppl.input)
    # Extra pipeline step: outline the recalled content in the source PDF, save the
    # marked copy, and pass the recalled text on to the formatter.
    ppl.draw_pdf = draw_pdf_bbox | bind(query=ppl.input)
    ppl.formatter = (lambda nodes, query: dict(context_str=nodes, query=query)) | bind(query=ppl.input)
    ppl.llm = lazyllm.OnlineChatModule(stream=False).prompt(lazyllm.ChatPrompter(prompt, extra_keys=["context_str"]))


if __name__ == "__main__":
    while True:
        print("✨ Welcome to your smart assistant ✨")

        try:
            query = input("\n🚀 Enter your query (type 'exit' to quit): \n> ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the user pressed Ctrl-C: leave exactly like "exit"
            # instead of dying with a traceback.
            query = "exit"

        if query.lower() == "exit":
            print("\n👋 Exiting... Thank you for the using!")
            break

        if not query:
            # Nothing was typed; do not spend a retrieval + LLM round trip on it.
            continue

        print(f"\n✅ Received your query: {query}\n")

        answer = ppl(query)

        print("\n" + "=" * 50)
        print("🚀 ANSWER 🚀")
        print("=" * 50 + "\n")
        print(answer)
        print("\n" + "=" * 50 + "\n")


# ===== File: Official_Acoount/LazyRAG_MinerU/utils/magic_pdf_reader.py (new file, mode 100644, blob 966176d) =====
import os
import torch
import copy
from pathlib import Path
from bs4 import BeautifulSoup
from typing import Dict, List, Optional, Iterable
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
import magic_pdf.model as model_config
from magic_pdf.libs import config_reader
from magic_pdf.config.ocr_content_type import BlockType, ContentType
from magic_pdf.libs.commons import join_path
from magic_pdf.dict2md import ocr_mkcontent
import unicodedata
from lazyllm.tools.rag import DocNode
from lazyllm import LOG
# get_image_path is defined in utils/utils.py, so import it from that module
# (the entry scripts import draw_pdf_bbox the same way).
from utils.utils import get_image_path

# NOTE: the environment must be set up and the models downloaded before use,
# see https://github.com/opendatalab/MinerU

# ---- patches applied to magic-pdf ----


# Configuration; update it to match the installed magic-pdf version.
# The values below correspond to version 1.0.1.
def read_config():
    """Return the magic-pdf configuration as a dict.

    Installed below as a replacement for ``config_reader.read_config``.
    ``models-dir`` and ``layoutreader-model-dir`` are empty and must be filled
    with the paths of the downloaded models.  ``device-mode`` is "cuda" when
    ``torch.cuda.is_available()`` is true, otherwise "cpu".
    """
    config = {
        "bucket_info": {
            "bucket-name-1": ["ak", "sk", "endpoint"],
            "bucket-name-2": ["ak", "sk", "endpoint"]
        },
        "models-dir": "",  # download the models and fill in the path
        "layoutreader-model-dir": "",  # download the models and fill in the path
        "layout-config": {
            "model": "doclayout_yolo"
        },
        "formula-config": {
            "mfd_model": "yolo_v8_mfd",
            "mfr_model": "unimernet_small",
            "enable": False
        },
        "table-config": {
            "model": "rapid_table",
            "enable": True,
            "max_time": 400
        },
        "config_version": "1.0.0"
    }

    config["device-mode"] = "cuda" if torch.cuda.is_available() else "cpu"
    return config


config_reader.read_config = read_config


def parse_line_spans(para_block, page_idx):
    """Collect one metadata dict per line of ``para_block``.

    For every line that has spans, the first span is deep-copied, its ``score``
    key is dropped, and a ``page`` key is added: ``page_idx + 1`` when the span
    carries a truthy ``cross_page`` flag, ``page_idx`` otherwise.  Lines without
    spans are skipped; a block without a ``lines`` key yields an empty list.
    """
    lines_metas = []
    for line_info in para_block.get('lines', []):
        spans = line_info['spans']
        if not spans:
            continue
        line_meta = copy.deepcopy(spans[0])
        line_meta.pop('score', None)
        cross_page = line_meta.pop('cross_page', None)
        line_meta['page'] = page_idx + 1 if cross_page else page_idx
        lines_metas.append(line_meta)
    return lines_metas


def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason=None):
    """Convert one magic-pdf paragraph block into a flat content dict.

    Installed below as a replacement for
    ``ocr_mkcontent.para_to_standard_format_v2``.  Compared with the fields set
    per block type, the result additionally carries ``page_idx``, the block's
    ``bbox`` and, when given, ``drop_reason``.

    Args:
        para_block: block dict with at least ``type`` and ``bbox``.
        img_buket_path: prefix joined in front of image paths.
        page_idx: index of the page the block belongs to.
        drop_reason: stored under ``drop_reason`` when not None.

    Returns:
        The content dict (type ``text``, ``equation``, ``image`` or ``table``;
        only ``page_idx``/``bbox`` for any other block type).
    """
    para_type = para_block['type']
    para_content = {}

    lines_metas = parse_line_spans(para_block, page_idx)
    if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
        para_content = {
            'type': 'text',
            'text': ocr_mkcontent.merge_para_with_text(para_block),
        }
    elif para_type == BlockType.Title:
        para_content = {
            'type': 'text',
            'text': ocr_mkcontent.merge_para_with_text(para_block),
            'text_level': 1,
        }
    elif para_type == BlockType.InterlineEquation:
        para_content = {
            'type': 'equation',
            'text': ocr_mkcontent.merge_para_with_text(para_block),
            'text_format': 'latex',
        }
    elif para_type == BlockType.Image:
        para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
        image_lines_metas = []
        for block in para_block['blocks']:
            if block['type'] == BlockType.ImageBody:
                for line in block['lines']:
                    for span in line['spans']:
                        if span['type'] == ContentType.Image:
                            if span.get('image_path', ''):
                                para_content['img_path'] = join_path(img_buket_path, span['image_path'])
            if block['type'] == BlockType.ImageCaption:
                image_lines_metas.extend(parse_line_spans(block, page_idx))
                para_content['img_caption'].append(ocr_mkcontent.merge_para_with_text(block))
            if block['type'] == BlockType.ImageFootnote:
                image_lines_metas.extend(parse_line_spans(block, page_idx))
                para_content['img_footnote'].append(ocr_mkcontent.merge_para_with_text(block))
        para_content['lines'] = image_lines_metas
    elif para_type == BlockType.Table:
        table_lines_metas = []
        para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.TableBody:
                for line in block['lines']:
                    for span in line['spans']:
                        if span['type'] == ContentType.Table:
                            # LaTeX is preferred over HTML when both are present.
                            if span.get('latex', ''):
                                para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n"
                            elif span.get('html', ''):
                                para_content['table_body'] = f"\n\n{span['html']}\n\n"

                            if span.get('image_path', ''):
                                para_content['img_path'] = join_path(img_buket_path, span['image_path'])

            if block['type'] == BlockType.TableCaption:
                table_lines_metas.extend(parse_line_spans(block, page_idx))
                para_content['table_caption'].append(ocr_mkcontent.merge_para_with_text(block))
            if block['type'] == BlockType.TableFootnote:
                table_lines_metas.extend(parse_line_spans(block, page_idx))
                para_content['table_footnote'].append(ocr_mkcontent.merge_para_with_text(block))
        para_content['lines'] = table_lines_metas

    para_content['page_idx'] = page_idx
    para_content['bbox'] = para_block['bbox']
    if lines_metas:
        para_content['lines'] = lines_metas
    # NOTE(review): the nesting of this statement could not be recovered from the
    # flattened source.  It is kept at function level, which removes 'lines' from
    # every result (line metadata is then never exported) -- confirm the intent.
    para_content.pop('lines', [])

    if drop_reason is not None:
        para_content['drop_reason'] = drop_reason

    return para_content


ocr_mkcontent.para_to_standard_format_v2 = para_to_standard_format_v2


class MagicPDFReader:
    """PDF reader built on magic-pdf.

    Parses text, images and tables of a PDF file and returns them as
    ``DocNode`` objects.  Images are written through a ``FileBasedDataWriter``
    rooted at ``get_image_path()``.
    """

    def __init__(self):
        self.image_save_path = get_image_path()

    def __call__(self, file: Path, split_documents: Optional[bool] = True, **kwargs) -> List[DocNode]:
        """Parse ``file``; extra keyword arguments are accepted and ignored."""
        return self._load_data(file, split_documents)

    def _load_data(self, file: Path, split_documents: Optional[bool] = True) -> List[DocNode]:
        """Parse a PDF and return its content as document nodes.

        Args:
            file: path of the PDF (a plain string is accepted as well).
            split_documents: when true, one node per parsed element, with every
                element field except ``text`` copied into the metadata together
                with ``file_name`` and ``file_path``; otherwise a single node
                holding all element texts joined by newlines, with only
                ``file_name`` in the metadata.
        """
        file = Path(file)  # .name is needed below; tolerate str input
        elements = self._parse_pdf_elements(file)
        docs = []

        if split_documents:
            for element in elements:
                metadata = {"file_name": file.name}
                metadata.update({k: v for k, v in element.items() if k != "text"})
                metadata["file_path"] = str(file)
                docs.append(DocNode(text=element.get("text", ""), metadata=metadata))
        else:
            text_chunks = [el["text"] for el in elements if "text" in el]
            docs.append(DocNode(text="\n".join(text_chunks), metadata={"file_name": file.name}))

        LOG.info(f"Successfully parsed {file.name}")
        return docs

    def _parse_pdf_elements(self, pdf_path: Path) -> List[dict]:
        """Run magic-pdf on ``pdf_path`` and return the extracted element dicts.

        ``ds.classify()`` selects OCR mode or text mode; the resulting content
        list is post-processed by ``_extract_content_blocks``.
        """
        image_dir = os.path.basename(self.image_save_path)
        os.makedirs(self.image_save_path, exist_ok=True)

        image_writer = FileBasedDataWriter(self.image_save_path)
        pdf_bytes = FileBasedDataReader("").read(str(pdf_path))

        ds = PymuDocDataset(pdf_bytes)

        if ds.classify() == SupportedPdfParseMethod.OCR:
            infer_result = ds.apply(doc_analyze, ocr=True)
            pipe_result = infer_result.pipe_ocr_mode(image_writer)
        else:
            infer_result = ds.apply(doc_analyze, ocr=False)
            pipe_result = infer_result.pipe_txt_mode(image_writer)

        # NOTE(review): the return value is unused; the call is kept from the
        # original in case it is needed for its side effects -- confirm.
        infer_result.get_infer_res()
        return self._extract_content_blocks(pipe_result.get_content_list(image_dir))

    def _extract_content_blocks(self, content_list) -> List[dict]:
        """Turn the magic-pdf content list into the element dicts used for nodes.

        * ``text`` entries are cleaned and stripped; empty ones are dropped.
          An entry with ``text_level`` becomes the current title, and inherits
          the previous title only when its level is greater than the previous
          title's level.  Entries without ``text_level`` inherit the current title.
        * ``image`` entries without ``img_path`` are dropped.
        * ``table`` entries get their ``table_body`` converted to Markdown.
        * Any other type (for example ``equation``) is skipped.

        ``page_idx`` is exposed as ``page`` in every returned element.  The
        ``text`` entries of ``content_list`` are modified in place.
        """
        blocks = []
        cur_title = ""
        cur_level = -1
        for content in content_list:
            block = {
                "bbox": content["bbox"],
                "lines": content.get("lines", []),
            }
            for line in block["lines"]:
                # Only spans that actually carry text have a 'content' key.
                if "content" in line:
                    line["content"] = self._clean_content(line["content"])

            content_type = content["type"]
            if content_type == "text":
                text = self._clean_content(content["text"]).strip()
                content["text"] = text
                if not text:
                    continue
                if "text_level" in content:
                    if cur_title and content["text_level"] > cur_level:
                        content["title"] = cur_title
                    cur_title = text
                    cur_level = content["text_level"]
                elif cur_title:
                    content["title"] = cur_title
                block = copy.deepcopy(content)
                block["page"] = block.pop("page_idx")
                blocks.append(block)
            elif content_type == "image":
                if not content["img_path"]:
                    continue
                block["type"] = content_type
                block["page"] = content["page_idx"]
                block["image_path"] = os.path.basename(content["img_path"])
                block["img_caption"] = self._clean_content(content["img_caption"])
                block["img_footnote"] = self._clean_content(content["img_footnote"])
                if cur_title:
                    block["title"] = cur_title
                blocks.append(block)
            elif content_type == "table":
                block["type"] = content_type
                block["page"] = content["page_idx"]
                if "table_body" in content:
                    block["text"] = self._html_table_to_markdown(self._clean_content(content["table_body"]))
                else:
                    block["text"] = ""
                if cur_title:
                    block["title"] = cur_title
                block["table_caption"] = self._clean_content(content["table_caption"])
                block["table_footnote"] = self._clean_content(content["table_footnote"])
                blocks.append(block)
        return blocks

    def _clean_content(self, content):
        """Normalise text: replace unencodable characters and apply NFKC.

        Strings are round-tripped through UTF-8 with ``errors="replace"`` and
        NFKC-normalised; lists are cleaned element by element (a new list is
        returned); any other value is returned unchanged.
        """
        if isinstance(content, str):
            content = content.encode("utf-8", "replace").decode("utf-8")
            return unicodedata.normalize("NFKC", content)
        if isinstance(content, list):
            return [self._clean_content(item) for item in content]
        return content

    @staticmethod
    def _span_value(raw) -> int:
        """Parse a rowspan/colspan attribute; missing or invalid values count as 1."""
        try:
            return max(int(raw), 1)
        except (TypeError, ValueError):
            return 1

    def _html_table_to_markdown(self, html_table) -> str:
        """Convert the first ``<table>`` in ``html_table`` to a Markdown table.

        The first row is used as the header.  A cell with ``colspan`` is repeated
        in each spanned column; columns covered by a ``rowspan`` from a row above
        are left empty.  ``|`` inside a cell is escaped.  Conversion is best
        effort: on any failure a warning is logged and ``''`` is returned.
        """
        try:
            soup = BeautifulSoup(html_table.strip(), 'html.parser')
            table = soup.find('table')
            if not table:
                raise ValueError("No <table> found in the HTML.")

            grid = []
            # pending[c]: number of upcoming rows in which column c is still
            # occupied by a rowspan cell that started in a row above.  The list
            # grows on demand so that cells shifted right by a rowspan cannot
            # index past its end.
            pending = []
            for tr in table.find_all('tr'):
                line = []
                for cell in tr.find_all(['td', 'th']):
                    rowspan = self._span_value(cell.get('rowspan'))
                    colspan = self._span_value(cell.get('colspan'))
                    text = cell.get_text(strip=True).replace('|', '\\|')

                    for _ in range(colspan):
                        # Skip columns still occupied from above.
                        while len(line) < len(pending) and pending[len(line)] > 0:
                            pending[len(line)] -= 1
                            line.append('')
                        col = len(line)
                        if col == len(pending):
                            pending.append(0)
                        line.append(text)
                        if rowspan > 1:
                            pending[col] = rowspan - 1

                # Columns to the right of the last cell of this row.
                while len(line) < len(pending):
                    col = len(line)
                    if pending[col] > 0:
                        pending[col] -= 1
                    line.append('')
                grid.append(line)

            if not grid:
                raise ValueError("The table contains no rows.")

            # Make the grid rectangular.
            width = max(len(line) for line in grid)
            for line in grid:
                line.extend([''] * (width - len(line)))

            header, *body = grid
            out = []
            if header:
                out.append('| ' + ' | '.join(header) + ' |')
                out.append('| ' + ' | '.join('-' * (len(h) or 3) for h in header) + ' |')
            out.extend('| ' + ' | '.join(row) + ' |' for row in body)
            return ''.join(f"{row}\n" for row in out)

        except Exception as e:  # best effort: a broken table must not abort the whole document
            LOG.warning(f"Error parsing table: {e}")
            return ''


# ===== File: Official_Acoount/LazyRAG_MinerU/utils/magic_pdf_transform.py (new file, mode 100644, blob 3261cc8) =====
from typing import Any, List, Sequence
from lazyllm.tools.rag import NodeTransform, DocNode
from typing import Union


class MagicPDFTransform(NodeTransform):
    """Node transform dedicated to magic-pdf parsing results.

    The node-building strategy can be customised; the current one merges nodes
    by section title and by a maximum text length.
    """

    def __init__(self, **kwargs):
        # Keyword arguments are accepted and ignored.
        super().__init__()

    def transform(self, document: DocNode, **kwargs) -> List[Union[str, DocNode]]:
        """Per-node hook; does nothing and returns None.

        The merging needs the whole list of nodes, so the work is done in
        ``batch_forward`` instead.
        """
        return None

    def batch_forward(self, documents, node_group, **kwargs):
        """Merge ``documents`` into section-level nodes and tag them with ``node_group``.

        The merged nodes get ``excluded_embed_metadata_keys = ['bbox', 'lines']``
        and ``_group = node_group``.

        NOTE(review): the first node of every group is reused as the merged node
        (see ``ConsolidationTextNodeParser._merge_text_nodes``), so the input
        nodes themselves are modified -- confirm this is what LazyLLM expects.
        """
        nodes = ConsolidationTextNodeParser.parse_nodes(documents)
        for node in nodes:
            node.excluded_embed_metadata_keys = ['bbox', 'lines']
            node._group = node_group
        return nodes


class ConsolidationTextNodeParser:
    """Walk a list of nodes and merge every non-title node into its section."""

    @classmethod
    def class_name(cls) -> str:
        return "ConsolidationTextNodeParser"

    @staticmethod
    def parse_nodes(nodes: List[DocNode], **kwargs: Any) -> List[DocNode]:
        """Group ``nodes`` by title and merge each group into a single node.

        Every node's ``bbox`` metadata is first rewritten to a list of
        ``{'page': ..., 'bbox': ...}`` dicts so that merged nodes can carry the
        boxes of all their parts.  Nodes whose ``bbox`` already has that shape
        are left alone, which keeps a repeated call from nesting the value.
        """
        for node in nodes:
            bbox = node.metadata['bbox']
            already_wrapped = (
                isinstance(bbox, list) and bool(bbox) and all(isinstance(item, dict) for item in bbox)
            )
            if not already_wrapped:
                node._metadata['bbox'] = [{'page': node.metadata['page'], 'bbox': bbox}]
        grouped_nodes = ConsolidationTextNodeParser._group_nodes(nodes)
        return [node for group in grouped_nodes for node in ConsolidationTextNodeParser._merge_text_nodes(group)]

    @staticmethod
    def _group_nodes(nodes: List["DocNode"], max_chars: int = 4096) -> List[List["DocNode"]]:
        """Split ``nodes`` into consecutive groups.

        A new group starts at every node with a truthy ``text_level`` metadata
        value, and whenever adding the node would make the group's text (node
        contents joined by a blank line) reach ``max_chars`` characters.  A single
        node longer than the limit forms a group of its own; empty groups are
        never produced.
        """
        grouped_nodes = []
        current_group = []
        current_len = 0  # len("\n\n".join(contents of current_group))

        for node in nodes:
            node_len = len(node._content)
            joined_len = current_len + 2 + node_len if current_group else node_len

            if node.metadata.get("text_level", 0) or joined_len >= max_chars:
                if current_group:
                    grouped_nodes.append(current_group)
                current_group = [node]
                current_len = node_len
            else:
                current_group.append(node)
                current_len = joined_len

        if current_group:
            grouped_nodes.append(current_group)

        return grouped_nodes

    @staticmethod
    def _merge_text_nodes(nodes: List["DocNode"]) -> List["DocNode"]:
        """Merge a group into its first node.

        The content of every following node is appended (separated by a blank
        line) and its ``bbox`` / ``lines`` metadata lists are appended to those of
        the first node, provided the first node has a non-empty list under that
        key.  Returns a list with the merged node, or an empty list for an empty
        group.
        """
        if not nodes:
            return []

        merged, *rest = nodes
        for node in rest:
            merged._content += f"\n\n{node._content}"

            if merged.metadata.get("bbox"):
                merged.metadata["bbox"].extend(node.metadata.get("bbox", []))

            if merged.metadata.get("lines"):
                merged.metadata["lines"].extend(node.metadata.get("lines", []))

        return [merged]

    @staticmethod
    def _merge_bbox(top_bbox: List, bottom_bbox: List) -> List:
        """Return the smallest box that contains both boxes.

        bbox format: [left_top_x, left_top_y, right_bottom_x, right_bottom_y].
        If ``bottom_bbox`` does not have exactly 4 values, ``top_bbox`` is used
        in its place.

        Raises:
            AssertionError: if ``top_bbox`` does not have 4 values or either box
                has its corners in the wrong order.
        """
        assert len(top_bbox) == 4, f"每个 bbox 必须包含 4 个值\n top_bbox: {top_bbox}"
        if len(bottom_bbox) != 4:
            bottom_bbox = top_bbox

        assert top_bbox[0] <= top_bbox[2] and top_bbox[1] <= top_bbox[3], f"top_bbox 格式不正确:{top_bbox}"
        assert bottom_bbox[0] <= bottom_bbox[2] and bottom_bbox[1] <= bottom_bbox[3], "bottom_bbox 格式不正确"

        # The union takes the minimum of the top-left corners and the maximum of
        # the bottom-right corners, whatever the order of the two arguments.
        return [
            min(top_bbox[0], bottom_bbox[0]),  # left_top_x
            min(top_bbox[1], bottom_bbox[1]),  # left_top_y
            max(top_bbox[2], bottom_bbox[2]),  # right_bottom_x
            max(top_bbox[3], bottom_bbox[3]),  # right_bottom_y
        ]


# ===== File: Official_Acoount/LazyRAG_MinerU/utils/utils.py (new file, mode 100644, blob 725b825) =====

import fitz
import os
import re
from pathlib import Path

# Characters that are path separators, illegal in Windows file names, or control characters.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _safe_filename_part(text, max_len=64):
    """Make free text usable inside a file name: replace unsafe characters, cap the length."""
    cleaned = _UNSAFE_FILENAME_CHARS.sub("_", str(text)).strip()
    return cleaned[:max_len] or "_"


def draw_pdf_bbox(nodes, query, **kwargs):
    """Outline the recalled content in the source PDFs and return the recalled text.

    For every source file referenced by ``nodes`` a copy named
    ``query[<query>] -- <file_name>`` is written to ``get_pdf_output_path()``
    with the nodes' boxes drawn on it.  Boxes of all nodes that come from the
    same file go into one copy.  The query is sanitised before it is used in
    the file name, because it is free user input.

    Args:
        nodes: recalled nodes; each must carry ``bbox``, ``file_name`` and
            ``file_path`` metadata.
        query: the user query.

    Returns:
        The contents of all nodes joined by newlines.
    """
    boxes_by_file = {}
    for node in nodes:
        meta = node.metadata
        key = (meta['file_path'], meta['file_name'])
        boxes_by_file.setdefault(key, []).extend(meta['bbox'])

    if boxes_by_file:
        output_dir = get_pdf_output_path()
        safe_query = _safe_filename_part(query)
        for (file_path, file_name), bbox_list in boxes_by_file.items():
            output_path = os.path.join(output_dir, f"query[{safe_query}] -- {file_name}")
            draw_bboxes_on_pdf(file_path, bbox_list, output_path, query)

    return "\n".join([node.get_content() for node in nodes])


def draw_bboxes_on_pdf(input_pdf_path, bbox_list, output_pdf_path, query):
    """Draw red rectangles on a PDF and save the result to ``output_pdf_path``.

    Args:
        input_pdf_path: PDF to open.
        bbox_list: dicts with ``page`` (matched against the 0-based page number)
            and ``bbox`` (the four values passed to ``fitz.Rect``).
        output_pdf_path: where the marked copy is saved.
        query: currently unused; kept for interface compatibility.
    """
    boxes_by_page = {}
    for item in bbox_list:
        boxes_by_page.setdefault(item['page'], []).append(item['bbox'])

    # The context manager closes the document even if drawing or saving fails.
    with fitz.open(input_pdf_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            boxes = boxes_by_page.get(page_num)
            if not boxes:
                continue
            page = pdf_document.load_page(page_num)
            for coords in boxes:
                page.draw_rect(fitz.Rect(*coords), color=(1, 0, 0), width=2)

        pdf_document.save(output_pdf_path)


def get_project_path():
    """Return the closest ancestor directory of this file named "LazyRAG_MinerU".

    Raises:
        FileNotFoundError: if no ancestor has that name.
    """
    current_path = Path(__file__).resolve()
    for parent in current_path.parents:
        if parent.name == "LazyRAG_MinerU":
            return parent
    raise FileNotFoundError("Project 'LazyRAG_MinerU' not found in the directory tree.")


def get_pdf_output_path():
    """Return ``<project>/data/draw_box_pdf``, creating the directory if needed."""
    path = os.path.join(get_project_path(), "data/draw_box_pdf")
    os.makedirs(path, exist_ok=True)
    return path


def get_image_path():
    """Return ``<project>/data/images``, creating the directory if needed."""
    path = os.path.join(get_project_path(), "data/images")
    os.makedirs(path, exist_ok=True)
    return path