From 6701e1e1ca7787f39a2984a5c5d48cc35ba4f775 Mon Sep 17 00:00:00 2001 From: ls Date: Wed, 30 Jul 2025 00:00:39 +0800 Subject: [PATCH] db update --- physical-base-core/pom.xml | 5 + .../java/org/jeecg/common/util/PDFUtil.java | 233 ++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 physical-base-core/src/main/java/org/jeecg/common/util/PDFUtil.java diff --git a/physical-base-core/pom.xml b/physical-base-core/pom.xml index 9bcd854..a14fff1 100644 --- a/physical-base-core/pom.xml +++ b/physical-base-core/pom.xml @@ -45,6 +45,11 @@ + + org.apache.pdfbox + pdfbox + 3.0.5 + org.jeecgframework.boot diff --git a/physical-base-core/src/main/java/org/jeecg/common/util/PDFUtil.java b/physical-base-core/src/main/java/org/jeecg/common/util/PDFUtil.java new file mode 100644 index 0000000..7545ab3 --- /dev/null +++ b/physical-base-core/src/main/java/org/jeecg/common/util/PDFUtil.java @@ -0,0 +1,233 @@ +/* + * Ant Group + * Copyright (c) 2004-2025 All Rights Reserved. + */ +package org.jeecg.common.util; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode; +import org.apache.pdfbox.text.PDFTextStripper; + +import java.io.File; +import java.io.IOException; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.text.TextPosition; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.cos.COSDictionary; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class PDFUtil { + + // 定义需要识别的标题列表 + private static final List TARGET_TITLES = Arrays.asList( + "辐照试验委托书", + "沟通记录表", + "合同/委托书等评审表", + "试验大纲评审表", + "试验流程检查单", + "辐照试验更改申请表", + "满意度调查表", + "辐照试验计划表", + "辐照试验(钻源)及退火过程记录", + "测试过程记录", + "试验报告评审表", + "供方财产登记表" + ); + + public static void main(String[] args) { + String inputFilePath = "/Users/lise/KC-2025-JL-25.pdf"; + String outputDir = "output/"; + + try { + splitPdfBySpecificTitles(inputFilePath, outputDir); + System.out.println("PDF分割完成!"); + } catch (IOException e) { + System.err.println("处理PDF时出错: " + e.getMessage()); + e.printStackTrace(); + } + } + + public static void splitPdfBySpecificTitles(String inputFilePath, String outputDir) throws IOException { + // 确保输出目录存在 + File outputDirectory = new File(outputDir); + if (!outputDirectory.exists()) { + outputDirectory.mkdirs(); + } + + // 加载原始PDF + File inputFile = new File(inputFilePath); + try (PDDocument originalDoc = Loader.loadPDF(inputFile)) { + // 创建自定义的TextStripper来提取标题 + TitleDetector titleDetector = new TitleDetector(); + + // 存储不同标题对应的文档 + Map titleDocuments = new HashMap<>(); + + // 当前处理的文档标题 + String currentTitle = "未分类文档"; + + // 逐页处理 + for (int pageNum = 0; pageNum < originalDoc.getNumberOfPages(); pageNum++) { + PDPage originalPage = originalDoc.getPage(pageNum); + + // 设置要处理的页面范围 + titleDetector.setStartPage(pageNum + 1); + titleDetector.setEndPage(pageNum + 1); + + // 提取当前页的标题 + titleDetector.getText(originalDoc); + String detectedTitle = titleDetector.getDetectedTitle(); + + // 检查是否匹配目标标题 + boolean isTargetTitle = false; + for (String targetTitle : TARGET_TITLES) { + if (detectedTitle != null && detectedTitle.contains(targetTitle)) { + currentTitle = targetTitle; + isTargetTitle = true; + break; + } + } + + if (!isTargetTitle) { + // 如果不是目标标题,保持当前分类 + System.out.println("页面 " + (pageNum + 1) + " 未检测到目标标题,归入: " + currentTitle); + continue; + } + + // 获取或创建对应标题的文档 + PDDocument targetDoc = titleDocuments.get(currentTitle); + if (targetDoc == null) { + targetDoc = new PDDocument(); + titleDocuments.put(currentTitle, targetDoc); + } + + // 创建新页面并复制内容 + PDPage newPage = new PDPage(new COSDictionary(originalPage.getCOSObject())); + newPage.setResources(originalPage.getResources()); + targetDoc.addPage(newPage); + } + + // 保存所有文档 + for (Map.Entry entry : titleDocuments.entrySet()) { + String title = entry.getKey(); + PDDocument doc = entry.getValue(); + + // 清理文件名 + String safeTitle = title.replaceAll("[\\\\/:*?\"<>|]", "_"); + String outputPath = outputDir + safeTitle + ".pdf"; + + doc.save(outputPath); + doc.close(); + System.out.println("已创建: " + outputPath); + } + } + } + + /** + * 自定义TextStripper用于检测特定标题 + */ + private static class TitleDetector extends PDFTextStripper { + private String detectedTitle = null; + private List titleTextPositions = new ArrayList<>(); + + public TitleDetector() throws IOException { + super(); + this.setSortByPosition(true); // 按位置排序文本 + } + + @Override + protected void writeString(String text, List textPositions) throws IOException { + // 获取页面高度 + PDRectangle pageSize = getCurrentPage().getMediaBox(); + float pageHeight = pageSize.getHeight(); + + // 只处理页面顶部20%的区域 + float titleAreaHeight = pageHeight * 0.2f; + + for (TextPosition textPosition : textPositions) { + float textY = textPosition.getTextMatrix().getTranslateY(); + + // 如果文本在标题区域内 + if (textY > (pageHeight - titleAreaHeight)) { + titleTextPositions.add(textPosition); + } + } + } + + @Override + public String getText(PDDocument document) throws IOException { + // 重置状态 + detectedTitle = null; + titleTextPositions.clear(); + + // 处理文档以提取文本 + super.getText(document); + + // 从收集的TextPosition重建标题 + if (!titleTextPositions.isEmpty()) { + StringBuilder titleBuilder = new StringBuilder(); + TextPosition lastPosition = null; + + for (TextPosition position : titleTextPositions) { + // 如果不是连续文本,添加空格 + if (lastPosition != null && + position.getTextMatrix().getTranslateX() > + lastPosition.getTextMatrix().getTranslateX() + lastPosition.getWidth()) { + titleBuilder.append(" "); + } + + titleBuilder.append(position.getUnicode()); + lastPosition = position; + } + + detectedTitle = titleBuilder.toString().trim(); + } + + return detectedTitle != null ? detectedTitle : ""; + } + + public String getDetectedTitle() { + return detectedTitle; + } + } +} \ No newline at end of file