db update

This commit is contained in:
ls
2025-07-30 00:00:39 +08:00
parent fb5e165473
commit 6701e1e1ca
2 changed files with 238 additions and 0 deletions

View File

@@ -45,6 +45,11 @@
</repositories> </repositories>
<dependencies> <dependencies>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.5</version>
</dependency>
<!--jeecg-tools--> <!--jeecg-tools-->
<dependency> <dependency>
<groupId>org.jeecgframework.boot</groupId> <groupId>org.jeecgframework.boot</groupId>

View File

@@ -0,0 +1,233 @@
/*
* Ant Group
* Copyright (c) 2004-2025 All Rights Reserved.
*/
package org.jeecg.common.util;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.File;
import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.cos.COSDictionary;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Splits a scanned/combined PDF into one PDF per known form title.
 *
 * <p>Each page's top region is scanned for one of {@link #TARGET_TITLES}. A page on which a
 * title is found starts (or continues) the bucket for that title; a page without a recognised
 * title is appended to the bucket of the most recently recognised title, so multi-page forms
 * stay together. Pages that appear before any title is recognised go to the
 * {@link #UNCLASSIFIED_TITLE} bucket.
 */
public class PDFUtil {

    /** Titles that are recognised as the start of a form. */
    private static final List<String> TARGET_TITLES = Arrays.asList(
            "辐照试验委托书",
            "沟通记录表",
            "合同/委托书等评审表",
            "试验大纲评审表",
            "试验流程检查单",
            "辐照试验更改申请表",
            "满意度调查表",
            "辐照试验计划表",
            "辐照试验(钻源)及退火过程记录",
            "测试过程记录",
            "试验报告评审表",
            "供方财产登记表"
    );

    /** Bucket name used for pages that precede the first recognised title. */
    private static final String UNCLASSIFIED_TITLE = "未分类文档";

    /** Fraction of the page height, measured from the top, that is searched for a title. */
    private static final float TITLE_AREA_RATIO = 0.2f;

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the input PDF path and {@code args[1]} the output
     *             directory; when omitted, the original hard-coded defaults are used
     */
    public static void main(String[] args) {
        String inputFilePath = (args != null && args.length > 0)
                ? args[0] : "/Users/lise/KC-2025-JL-25.pdf";
        String outputDir = (args != null && args.length > 1) ? args[1] : "output/";
        try {
            splitPdfBySpecificTitles(inputFilePath, outputDir);
            System.out.println("PDF分割完成");
        } catch (IOException e) {
            System.err.println("处理PDF时出错: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Splits {@code inputFilePath} into one PDF per recognised title and writes the results
     * into {@code outputDir} as {@code <title>.pdf} (characters that are illegal in file names
     * are replaced by {@code _}).
     *
     * @param inputFilePath path of the PDF to split
     * @param outputDir     directory that receives the generated PDFs; created if missing.
     *                      An empty string means the current working directory.
     * @throws IOException if the output directory cannot be created, or the input cannot be
     *                     read, or an output file cannot be written
     */
    public static void splitPdfBySpecificTitles(String inputFilePath, String outputDir) throws IOException {
        // Make sure the output directory exists; fail loudly instead of failing later on save.
        File outputDirectory = new File(outputDir == null || outputDir.isEmpty() ? "." : outputDir);
        if (!outputDirectory.mkdirs() && !outputDirectory.isDirectory()) {
            throw new IOException("无法创建输出目录: " + outputDirectory.getAbsolutePath());
        }

        try (PDDocument originalDoc = Loader.loadPDF(new File(inputFilePath))) {
            // One output document per title. They share data with originalDoc, so they must be
            // saved and closed while originalDoc is still open (hence the nested try/finally).
            Map<String, PDDocument> titleDocuments = new HashMap<>();
            try {
                TitleDetector titleDetector = new TitleDetector();
                String currentTitle = UNCLASSIFIED_TITLE;

                int pageCount = originalDoc.getNumberOfPages();
                for (int pageNum = 0; pageNum < pageCount; pageNum++) {
                    PDPage originalPage = originalDoc.getPage(pageNum);

                    // Restrict text extraction to this single page (stripper pages are 1-based).
                    titleDetector.setStartPage(pageNum + 1);
                    titleDetector.setEndPage(pageNum + 1);
                    titleDetector.getText(originalDoc);

                    String matchedTitle = matchTargetTitle(titleDetector.getDetectedTitle());
                    if (matchedTitle != null) {
                        currentTitle = matchedTitle;
                    } else {
                        // No title on this page: it is a continuation of the current bucket.
                        System.out.println("页面 " + (pageNum + 1) + " 未检测到目标标题,归入: " + currentTitle);
                    }

                    PDDocument targetDoc = titleDocuments.get(currentTitle);
                    if (targetDoc == null) {
                        targetDoc = new PDDocument();
                        titleDocuments.put(currentTitle, targetDoc);
                    }

                    // importPage resolves inherited media box / crop box / rotation; resources
                    // are set explicitly so inherited resources are carried over as well.
                    PDPage importedPage = targetDoc.importPage(originalPage);
                    importedPage.setResources(originalPage.getResources());
                }

                for (Map.Entry<String, PDDocument> entry : titleDocuments.entrySet()) {
                    // Replace characters that are not allowed in file names.
                    String safeTitle = entry.getKey().replaceAll("[\\\\/:*?\"<>|]", "_");
                    File outputFile = new File(outputDirectory, safeTitle + ".pdf");
                    PDDocument doc = entry.getValue();
                    doc.save(outputFile);
                    doc.close();
                    System.out.println("已创建: " + outputFile.getPath());
                }
            } finally {
                // Release anything still open if detection or saving failed part-way.
                closeQuietly(titleDocuments.values());
            }
        }
    }

    /**
     * Returns the first entry of {@link #TARGET_TITLES} contained in {@code detectedTitle},
     * or {@code null} when there is none (including when {@code detectedTitle} is null).
     */
    private static String matchTargetTitle(String detectedTitle) {
        if (detectedTitle == null) {
            return null;
        }
        for (String targetTitle : TARGET_TITLES) {
            if (detectedTitle.contains(targetTitle)) {
                return targetTitle;
            }
        }
        return null;
    }

    /**
     * Best-effort cleanup: closes every document and reports, rather than propagates, close
     * failures so that an exception already in flight is not masked.
     */
    private static void closeQuietly(Iterable<PDDocument> documents) {
        for (PDDocument document : documents) {
            try {
                document.close();
            } catch (IOException e) {
                System.err.println("关闭PDF文档时出错: " + e.getMessage());
            }
        }
    }

    /**
     * Text stripper that, instead of producing page text, collects the glyphs located in the
     * top {@link #TITLE_AREA_RATIO} of the page and joins them into a candidate title.
     */
    private static class TitleDetector extends PDFTextStripper {
        /** Title text assembled by the last {@link #getText(PDDocument)} call; null if none. */
        private String detectedTitle = null;
        /** Glyphs found inside the title area during the current extraction run. */
        private final List<TextPosition> titleTextPositions = new ArrayList<>();

        public TitleDetector() throws IOException {
            super();
            // Deliver text ordered by position on the page rather than by stream order.
            this.setSortByPosition(true);
        }

        /**
         * Collects the glyphs whose text-matrix Y translation lies in the top part of the page.
         * Nothing is written to the stripper's output.
         */
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
            PDRectangle pageSize = getCurrentPage().getMediaBox();
            float pageHeight = pageSize.getHeight();
            float titleAreaHeight = pageHeight * TITLE_AREA_RATIO;
            // NOTE(review): assumes Y grows upwards from the bottom of an unrotated page, so
            // "top" means a large Y value - confirm for rotated or cropped pages.
            float titleAreaBottom = pageHeight - titleAreaHeight;

            for (TextPosition textPosition : textPositions) {
                float textY = textPosition.getTextMatrix().getTranslateY();
                if (textY > titleAreaBottom) {
                    titleTextPositions.add(textPosition);
                }
            }
        }

        /**
         * Runs extraction over the configured page range and rebuilds the title from the
         * collected glyphs.
         *
         * @return the detected title, or an empty string when no glyph fell in the title area
         */
        @Override
        public String getText(PDDocument document) throws IOException {
            // Reset state left over from the previous page.
            detectedTitle = null;
            titleTextPositions.clear();

            // Drives writeString(); the returned text is intentionally ignored.
            super.getText(document);

            if (!titleTextPositions.isEmpty()) {
                StringBuilder titleBuilder = new StringBuilder();
                TextPosition lastPosition = null;
                for (TextPosition position : titleTextPositions) {
                    // Insert a space when this glyph starts beyond the end of the previous one.
                    if (lastPosition != null
                            && position.getTextMatrix().getTranslateX()
                            > lastPosition.getTextMatrix().getTranslateX() + lastPosition.getWidth()) {
                        titleBuilder.append(" ");
                    }
                    titleBuilder.append(position.getUnicode());
                    lastPosition = position;
                }
                detectedTitle = titleBuilder.toString().trim();
            }
            return detectedTitle != null ? detectedTitle : "";
        }

        /** Returns the title found by the last {@link #getText(PDDocument)} call, or null. */
        public String getDetectedTitle() {
            return detectedTitle;
        }
    }
}