This commit is contained in:
ls
2024-10-15 10:52:55 +08:00
parent e1f74f16de
commit a38c86ebee
5 changed files with 281 additions and 76 deletions

View File

@@ -45,11 +45,25 @@
<artifactId>drag-free-springboot3</artifactId>
<version>1.1.2</version>
</dependency>
<!-- chatgpt -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-core</artifactId>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-http</artifactId>
</dependency>
<!-- chatgpt -->
<dependency>
<groupId>org.jeecgframework.boot</groupId>
<artifactId>jeecg-boot-starter3-chatgpt</artifactId>
<version>3.7.0</version>
<exclusions>
<exclusion>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<!-- jsoup HTML parser library @ https://jsoup.org/ -->

View File

@@ -1,77 +1,145 @@
package org.jeecg.modules.database.entity;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import java.math.BigDecimal;
import com.baomidou.mybatisplus.annotation.IdType;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import com.baomidou.mybatisplus.annotation.TableLogic;
import lombok.Data;
import com.fasterxml.jackson.annotation.JsonFormat;
import org.springframework.format.annotation.DateTimeFormat;
import org.jeecgframework.poi.excel.annotation.Excel;
import org.jeecg.common.aspect.annotation.Dict;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.experimental.Accessors;
import org.jeecgframework.poi.excel.annotation.Excel;
import org.springframework.format.annotation.DateTimeFormat;
import java.io.Serializable;
import java.util.Date;
/**
* Entity mapped to the {@code crawler_record} table; each instance holds the metadata
* of one crawled report together with the location of its file.
*
* @Description: Crawler record
* @Author: jeecg-boot
* @Date: 2024-09-03
* @Date: 2024-10-14
* @Version: V1.0
*/
@Data
@TableName("crawler_record")
@Accessors(chain = true)
@EqualsAndHashCode(callSuper = false)
@Schema(description="爬虫记录")
@Schema(description = "爬虫记录")
public class CrawlerRecord implements Serializable {
private static final long serialVersionUID = 1L;
/** Primary key */
@TableId(type = IdType.ASSIGN_ID)
/**
* Primary key
*/
@TableId(type = IdType.ASSIGN_ID)
@Schema(description = "主键")
private String id;
/** Created by */
/**
* Created by
*/
@Schema(description = "创建人")
private String createBy;
/** Creation date */
@JsonFormat(timezone = "GMT+8",pattern = "yyyy-MM-dd HH:mm:ss")
@DateTimeFormat(pattern="yyyy-MM-dd HH:mm:ss")
/**
* Creation date
*/
@JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm:ss")
@DateTimeFormat(pattern = "yyyy-MM-dd HH:mm:ss")
@Schema(description = "创建日期")
private Date createTime;
/** Updated by */
/**
* Updated by
*/
@Schema(description = "更新人")
private String updateBy;
/** Update date */
@JsonFormat(timezone = "GMT+8",pattern = "yyyy-MM-dd HH:mm:ss")
@DateTimeFormat(pattern="yyyy-MM-dd HH:mm:ss")
/**
* Update date
*/
@JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm:ss")
@DateTimeFormat(pattern = "yyyy-MM-dd HH:mm:ss")
@Schema(description = "更新日期")
private Date updateTime;
/** Owning department */
/**
* Owning department
*/
@Schema(description = "所属部门")
private String sysOrgCode;
/** Source */
@Excel(name = "来源", width = 15)
@Schema(description = "来源")
private String source;
/** Code */
@Excel(name = "编号", width = 15)
/**
* Type
*/
@Excel(name = "类型", width = 15)
@Schema(description = "类型")
private String type;
/**
* Code
*/
@Excel(name = "编号", width = 15)
@Schema(description = "编号")
private String code;
/** Name */
@Excel(name = "名称", width = 15)
@Schema(description = "名称")
private String name;
/** File name */
@Excel(name = "文件名", width = 15)
@Schema(description = "文件名")
private String fileName;
/** File ID */
@Excel(name = "文件ID", width = 15)
@Schema(description = "文件ID")
private String fileId;
/**
* Manufacturer
*/
@Excel(name = "制造商", width = 15)
@Schema(description = "制造商")
private String manufacturer;
/**
* File
*/
@Excel(name = "文件", width = 15)
@Schema(description = "文件")
private String fileUrl;
/**
* Test type
*/
@Excel(name = "测试类型", width = 15)
@Schema(description = "测试类型")
private String functionType;
/**
* Test method
*/
@Excel(name = "测试方式", width = 15)
@Schema(description = "测试方式")
private String testMethod;
/**
* Category
*/
@Excel(name = "分组", width = 15)
@Schema(description = "分组")
private String category;
/**
* Sub-category
*/
@Excel(name = "二级分组", width = 15)
@Schema(description = "二级分组")
private String subCategory;
/**
* Document date
*/
@Excel(name = "文档日期", width = 15)
@Schema(description = "文档日期")
private String reportDate;
/**
* Test technology
*/
@Excel(name = "测试技术", width = 15)
@Schema(description = "测试技术")
private String technology;
/**
* Report source
*/
@Excel(name = "报告来源", width = 15)
@Schema(description = "报告来源")
private String reportSource;
/**
* Radiation test type
*/
@Excel(name = "辐射测试类型", width = 15)
@Schema(description = "辐射测试类型")
private String radiationTestType;
/**
* Report ID
*/
@Excel(name = "报告ID", width = 15)
@Schema(description = "报告ID")
private String reportId;
}

View File

@@ -1,10 +1,21 @@
package org.jeecg.modules.database.service.impl;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.IoUtil;
import cn.hutool.core.thread.ThreadUtil;
import cn.hutool.core.util.ObjUtil;
import cn.hutool.core.util.ReUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.core.util.URLUtil;
import cn.hutool.http.Header;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import com.baomidou.mybatisplus.core.toolkit.Wrappers;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import lombok.extern.slf4j.Slf4j;
import okhttp3.*;
@@ -24,9 +35,16 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import static org.jeecg.modules.online.auth.b.a.f;
import static org.jeecg.modules.online.auth.b.a.i;
@Slf4j
@Service
@@ -40,6 +58,22 @@ public class CrawlerRecordServiceImpl extends ServiceImpl<CrawlerRecordMapper, C
private String esaradKey = "crawler-esarad";
private String radhomeKey = "crawler-radhome";
/**
 * Ad-hoc manual check: requests one ESARAD report download and prints the file name
 * advertised in the {@code Content-Disposition} response header, preceded by a
 * URL-decoded sample name.
 *
 * <p>Nothing is printed when the response carries no {@code Content-Disposition} header.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // try-with-resources releases the response body stream and the connection.
    try (HttpResponse response = HttpRequest.get("https://esarad.esa.int/?id=76&handler=DownloadDb").execute()) {
        final List<String> dispositions = response.headerList(Header.CONTENT_DISPOSITION.getValue());
        if (CollUtil.isEmpty(dispositions)) {
            return;
        }
        String fileName = null;
        for (String disposition : dispositions) {
            fileName = ReUtil.getGroup1("filename" + "=([^;]+)", disposition);
            if (StrUtil.isNotBlank(fileName)) {
                // Keep the first match: a later header without a filename parameter
                // must not overwrite it with null.
                break;
            }
        }
        // Percent-encoded names are decoded as UTF-8 rather than with the platform
        // default charset, so the output does not depend on the host configuration.
        System.out.println(URLUtil.decode("N2920A%20TID_1009_01.pdf", java.nio.charset.StandardCharsets.UTF_8));
        System.out.println(fileName);
        // TODO: the RFC 5987 "filename*" parameter should take precedence over plain
        // "filename" when present; it is not parsed here yet.
    }
}
/**
* https://esarad.esa.int/
*/
@@ -51,7 +85,8 @@ public class CrawlerRecordServiceImpl extends ServiceImpl<CrawlerRecordMapper, C
throw new RuntimeException("爬虫任务执行中");
}
redisUtil.set(esaradKey, "T", 24 * 60 * 60);
List<Map<String, String>> tableData = new ArrayList<>();
List<CrawlerRecord> tableData = new ArrayList<>();
Document doc = Jsoup.connect("https://esarad.esa.int").get();
Element table = doc.getElementById("dtReports");
@@ -78,33 +113,71 @@ public class CrawlerRecordServiceImpl extends ServiceImpl<CrawlerRecordMapper, C
// Select all rows in the tbody
Elements rows = tbody.select("tr");
for (Element row : rows) {
for (int j = 0; j < rows.size(); j++) {
if (j > 10) {
break;
}
Element row = rows.get(j);
// Select all cells in the row
CrawlerRecord crawlerRecord = new CrawlerRecord();
Elements cells = row.select("td");
if (cells.size() == headerNames.size()) { // Ensure the number of cells matches the number of headers
Map<String, String> rowMap = new HashMap<>();
for (int i = 0; i < cells.size(); i++) {
String header = headerNames.get(i);
String value = cells.get(i).text();
rowMap.put(header, value);
switch (header) {
case "Radiation Test Method":
crawlerRecord.setTestMethod(value);
break;
case "EPPL Familiy":
crawlerRecord.setCategory(value);
break;
case "EPPL Group":
crawlerRecord.setSubCategory(value);
break;
case "DUT Manufacturer":
crawlerRecord.setManufacturer(value);
break;
case "Function":
crawlerRecord.setFunctionType(value);
break;
case "Report Date":
crawlerRecord.setReportDate(value);
break;
case "Report Source":
crawlerRecord.setReportSource(value);
break;
case "Technology":
crawlerRecord.setTechnology(value);
break;
case "Id":
crawlerRecord.setReportId(value);
break;
case "DUT part type":
crawlerRecord.setCode(value);
break;
case "Radiation Test Type":
crawlerRecord.setRadiationTestType(value);
break;
}
}
// Add the map to the list
tableData.add(rowMap);
tableData.add(crawlerRecord);
}
}
// Print the list of maps
for (Map<String, String> rowMap : tableData) {
rowMap.put("fileId", rowMap.get("Id"));
rowMap.put("fileUrl", "https://esarad.esa.int/?id=" + rowMap.get("Id") + "&handler=DownloadDb");
rowMap.put("fileName", rowMap.get("Id") + ".pdf");
System.out.println(rowMap);
for (CrawlerRecord rowMap : tableData) {
rowMap.setFileUrl("https://esarad.esa.int/?id=" + rowMap.getReportId() + "&handler=DownloadDb");
}
saveFiles(tableData, esaradKey);
saveEsaradFiles(tableData, esaradKey);
} catch (Exception e) {
redisUtil.del(esaradKey);
@@ -139,20 +212,29 @@ public class CrawlerRecordServiceImpl extends ServiceImpl<CrawlerRecordMapper, C
JSONArray list = jsonObject.getJSONArray("ROWS");
System.out.println("total count " + total);
System.out.println("total list " + list.get(0));
List<Map<String, String>> tableData = new ArrayList<>();
List<CrawlerRecord> tableData = new ArrayList<>();
list.forEach(row -> {
String fileId = String.valueOf(((JSONArray) row).get(0));
String fileNames = String.valueOf(((JSONArray) row).get(4));
Map<String, String> map = new HashMap<>();
map.put("fileName", fileNames);
for (int i = 0; i < list.size(); i++) {
if (i > 10) {
break;
}
JSONArray row = (JSONArray) list.get(i);
String fileNames = String.valueOf(row.get(4));
CrawlerRecord map = new CrawlerRecord();
String fileUrls = fixFileNames(fileNames);
map.put("fileId", fileId);
map.put("fileUrl", fileUrls);
System.out.println(row);
map.setFileUrl(fileUrls);
map.setCode(String.valueOf(row.get(0)));
map.setFunctionType(String.valueOf(row.get(1)));
map.setManufacturer(String.valueOf(row.get(2)));
map.setReportDate(String.valueOf(row.get(3)));
map.setTestMethod(String.valueOf(row.get(5)));
map.setCategory(String.valueOf(row.get(6)));
map.setReportId(map.getCode().replaceAll(" ", ""));
tableData.add(map);
});
saveFiles(tableData, radhomeKey);
}
saveRadhomeFiles(tableData, radhomeKey);
}
@Override
@@ -180,29 +262,65 @@ public class CrawlerRecordServiceImpl extends ServiceImpl<CrawlerRecordMapper, C
return StringUtils.join(result, ";");
}
/**
 * Downloads the file(s) referenced by every entry of {@code fileList}, uploads them
 * through {@code MinioUtil.upload} and persists each record whose upload produced a
 * non-blank result. The whole job is handed to {@code ThreadUtil.execute}.
 *
 * @param fileList entries to process; the file URL of an entry may hold several URLs
 *                 separated by ';'
 * @param type     Redis key that is deleted once every entry has been processed
 */
private void saveFiles(List<Map<String, String>> fileList, String type) {
private void saveRadhomeFiles(List<CrawlerRecord> fileList, String type) {
ThreadUtil.execute(() -> {
try {
for (Map<String, String> map : fileList) {
String fileId = map.get("fileId");
String resultStr = "";
String fileUrl = map.get("fileUrl");
for (CrawlerRecord record : fileList) {
String fileUploadResult = "";
String fileUrl = record.getFileUrl();
// A ';' means the entry references several files: each one is downloaded and
// uploaded on its own and the upload results are joined back with ';'.
if (fileUrl.contains(";")) {
String[] split = fileUrl.split(";");
List<String> result = new ArrayList<>();
for (String s : split) {
byte[] fileBytes = HttpUtil.downloadBytes(s);
InputStream inputStream = new ByteArrayInputStream(fileBytes);
// The object name is the last path segment of the URL.
result.add(MinioUtil.upload(inputStream, s.substring(s.lastIndexOf("/") + 1)));
resultStr = StringUtils.join(result, ";");
// Same object name, now placed under the "radhome/" prefix.
result.add(MinioUtil.upload(inputStream, "radhome/" + s.substring(s.lastIndexOf("/") + 1)));
fileUploadResult = StringUtils.join(result, ";");
}
} else {
// Single URL: download once and upload under the URL's last path segment.
byte[] fileBytes = HttpUtil.downloadBytes(fileUrl);
InputStream inputStream = new ByteArrayInputStream(fileBytes);
resultStr = MinioUtil.upload(inputStream, fileUrl.substring(fileUrl.lastIndexOf("/") + 1));
fileUploadResult = MinioUtil.upload(inputStream, "radhome/" + fileUrl.substring(fileUrl.lastIndexOf("/") + 1));
}
System.out.println(fileUploadResult);
// Only records with a non-blank upload result are saved; the record's fileUrl is
// replaced by that result first.
if (StringUtils.isNotBlank(fileUploadResult)) {
record.setFileUrl(fileUploadResult);
save(record);
}
}
// NOTE(review): this delete sits inside the try block, so it is skipped when any
// download/upload/save throws — the key then presumably stays until it expires; confirm.
redisUtil.del(type);
} catch (Exception e) {
// NOTE(review): the failure is only printed; the remaining entries are not processed.
e.printStackTrace();
}
});
}
private void saveEsaradFiles(List<CrawlerRecord> fileList, String type) {
ThreadUtil.execute(() -> {
try {
for (CrawlerRecord record : fileList) {
CrawlerRecord dbData = getOne(Wrappers.<CrawlerRecord>lambdaQuery().eq(CrawlerRecord::getReportSource, record.getReportId()));
if (Objects.nonNull(dbData)) {
continue;
}
String resultStr = "";
String dest = FileUtil.getTmpDirPath() + "esarad-" + record.getReportId() + "/";
FileUtil.mkdir(dest);
long fileSize = HttpUtil.downloadFile(record.getFileUrl(), dest);
if (fileSize > 0) {
List<File> files = FileUtil.loopFiles(dest);
for (File file : files) {
resultStr = MinioUtil.upload(IoUtil.toStream(file), "esarad/" + record.getReportId() + "-" + URLUtil.decode(file.getName(), Charset.defaultCharset()) );
}
}
System.out.println(resultStr);
if (StringUtils.isNotBlank(resultStr)) {
record.setFileUrl(resultStr);
save(record);
}
}
redisUtil.del(type);
} catch (Exception e) {