db update
This commit is contained in:
@@ -45,6 +45,11 @@
|
||||
</repositories>
|
||||
|
||||
<dependencies>
|
||||
<!-- Apache PDFBox: PDF loading, text extraction and page handling used by org.jeecg.common.util.PDFUtil -->
<dependency>
|
||||
<groupId>org.apache.pdfbox</groupId>
|
||||
<artifactId>pdfbox</artifactId>
|
||||
<version>3.0.5</version>
|
||||
</dependency>
|
||||
<!--jeecg-tools-->
|
||||
<dependency>
|
||||
<groupId>org.jeecgframework.boot</groupId>
|
||||
|
||||
@@ -0,0 +1,233 @@
|
||||
/*
|
||||
* Ant Group
|
||||
* Copyright (c) 2004-2025 All Rights Reserved.
|
||||
*/
|
||||
package org.jeecg.common.util;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class PDFUtil {
|
||||
|
||||
// 定义需要识别的标题列表
|
||||
private static final List<String> TARGET_TITLES = Arrays.asList(
|
||||
"辐照试验委托书",
|
||||
"沟通记录表",
|
||||
"合同/委托书等评审表",
|
||||
"试验大纲评审表",
|
||||
"试验流程检查单",
|
||||
"辐照试验更改申请表",
|
||||
"满意度调查表",
|
||||
"辐照试验计划表",
|
||||
"辐照试验(钻源)及退火过程记录",
|
||||
"测试过程记录",
|
||||
"试验报告评审表",
|
||||
"供方财产登记表"
|
||||
);
|
||||
|
||||
public static void main(String[] args) {
|
||||
String inputFilePath = "/Users/lise/KC-2025-JL-25.pdf";
|
||||
String outputDir = "output/";
|
||||
|
||||
try {
|
||||
splitPdfBySpecificTitles(inputFilePath, outputDir);
|
||||
System.out.println("PDF分割完成!");
|
||||
} catch (IOException e) {
|
||||
System.err.println("处理PDF时出错: " + e.getMessage());
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
public static void splitPdfBySpecificTitles(String inputFilePath, String outputDir) throws IOException {
|
||||
// 确保输出目录存在
|
||||
File outputDirectory = new File(outputDir);
|
||||
if (!outputDirectory.exists()) {
|
||||
outputDirectory.mkdirs();
|
||||
}
|
||||
|
||||
// 加载原始PDF
|
||||
File inputFile = new File(inputFilePath);
|
||||
try (PDDocument originalDoc = Loader.loadPDF(inputFile)) {
|
||||
// 创建自定义的TextStripper来提取标题
|
||||
TitleDetector titleDetector = new TitleDetector();
|
||||
|
||||
// 存储不同标题对应的文档
|
||||
Map<String, PDDocument> titleDocuments = new HashMap<>();
|
||||
|
||||
// 当前处理的文档标题
|
||||
String currentTitle = "未分类文档";
|
||||
|
||||
// 逐页处理
|
||||
for (int pageNum = 0; pageNum < originalDoc.getNumberOfPages(); pageNum++) {
|
||||
PDPage originalPage = originalDoc.getPage(pageNum);
|
||||
|
||||
// 设置要处理的页面范围
|
||||
titleDetector.setStartPage(pageNum + 1);
|
||||
titleDetector.setEndPage(pageNum + 1);
|
||||
|
||||
// 提取当前页的标题
|
||||
titleDetector.getText(originalDoc);
|
||||
String detectedTitle = titleDetector.getDetectedTitle();
|
||||
|
||||
// 检查是否匹配目标标题
|
||||
boolean isTargetTitle = false;
|
||||
for (String targetTitle : TARGET_TITLES) {
|
||||
if (detectedTitle != null && detectedTitle.contains(targetTitle)) {
|
||||
currentTitle = targetTitle;
|
||||
isTargetTitle = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!isTargetTitle) {
|
||||
// 如果不是目标标题,保持当前分类
|
||||
System.out.println("页面 " + (pageNum + 1) + " 未检测到目标标题,归入: " + currentTitle);
|
||||
continue;
|
||||
}
|
||||
|
||||
// 获取或创建对应标题的文档
|
||||
PDDocument targetDoc = titleDocuments.get(currentTitle);
|
||||
if (targetDoc == null) {
|
||||
targetDoc = new PDDocument();
|
||||
titleDocuments.put(currentTitle, targetDoc);
|
||||
}
|
||||
|
||||
// 创建新页面并复制内容
|
||||
PDPage newPage = new PDPage(new COSDictionary(originalPage.getCOSObject()));
|
||||
newPage.setResources(originalPage.getResources());
|
||||
targetDoc.addPage(newPage);
|
||||
}
|
||||
|
||||
// 保存所有文档
|
||||
for (Map.Entry<String, PDDocument> entry : titleDocuments.entrySet()) {
|
||||
String title = entry.getKey();
|
||||
PDDocument doc = entry.getValue();
|
||||
|
||||
// 清理文件名
|
||||
String safeTitle = title.replaceAll("[\\\\/:*?\"<>|]", "_");
|
||||
String outputPath = outputDir + safeTitle + ".pdf";
|
||||
|
||||
doc.save(outputPath);
|
||||
doc.close();
|
||||
System.out.println("已创建: " + outputPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 自定义TextStripper用于检测特定标题
|
||||
*/
|
||||
private static class TitleDetector extends PDFTextStripper {
|
||||
private String detectedTitle = null;
|
||||
private List<TextPosition> titleTextPositions = new ArrayList<>();
|
||||
|
||||
public TitleDetector() throws IOException {
|
||||
super();
|
||||
this.setSortByPosition(true); // 按位置排序文本
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
||||
// 获取页面高度
|
||||
PDRectangle pageSize = getCurrentPage().getMediaBox();
|
||||
float pageHeight = pageSize.getHeight();
|
||||
|
||||
// 只处理页面顶部20%的区域
|
||||
float titleAreaHeight = pageHeight * 0.2f;
|
||||
|
||||
for (TextPosition textPosition : textPositions) {
|
||||
float textY = textPosition.getTextMatrix().getTranslateY();
|
||||
|
||||
// 如果文本在标题区域内
|
||||
if (textY > (pageHeight - titleAreaHeight)) {
|
||||
titleTextPositions.add(textPosition);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getText(PDDocument document) throws IOException {
|
||||
// 重置状态
|
||||
detectedTitle = null;
|
||||
titleTextPositions.clear();
|
||||
|
||||
// 处理文档以提取文本
|
||||
super.getText(document);
|
||||
|
||||
// 从收集的TextPosition重建标题
|
||||
if (!titleTextPositions.isEmpty()) {
|
||||
StringBuilder titleBuilder = new StringBuilder();
|
||||
TextPosition lastPosition = null;
|
||||
|
||||
for (TextPosition position : titleTextPositions) {
|
||||
// 如果不是连续文本,添加空格
|
||||
if (lastPosition != null &&
|
||||
position.getTextMatrix().getTranslateX() >
|
||||
lastPosition.getTextMatrix().getTranslateX() + lastPosition.getWidth()) {
|
||||
titleBuilder.append(" ");
|
||||
}
|
||||
|
||||
titleBuilder.append(position.getUnicode());
|
||||
lastPosition = position;
|
||||
}
|
||||
|
||||
detectedTitle = titleBuilder.toString().trim();
|
||||
}
|
||||
|
||||
return detectedTitle != null ? detectedTitle : "";
|
||||
}
|
||||
|
||||
public String getDetectedTitle() {
|
||||
return detectedTitle;
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user