From a38c86ebeed207a2fbed1d72eda20b35f5632a7b Mon Sep 17 00:00:00 2001 From: ls Date: Tue, 15 Oct 2024 10:52:55 +0800 Subject: [PATCH] update --- .../src/main/resources/application-dev.yml | 2 +- .../physical-system-biz/pom.xml | 16 +- .../database/entity/CrawlerRecord.java | 148 ++++++++++---- .../impl/CrawlerRecordServiceImpl.java | 184 ++++++++++++++---- pom.xml | 7 +- 5 files changed, 281 insertions(+), 76 deletions(-) mode change 100755 => 100644 physical-module-system/physical-system-biz/src/main/java/org/jeecg/modules/database/entity/CrawlerRecord.java diff --git a/physical-launcher/src/main/resources/application-dev.yml b/physical-launcher/src/main/resources/application-dev.yml index 22e4d3c..516c3cb 100644 --- a/physical-launcher/src/main/resources/application-dev.yml +++ b/physical-launcher/src/main/resources/application-dev.yml @@ -242,7 +242,7 @@ jeecg: file-view-domain: http://fileview.jeecg.com # minio文件上传 minio: - minio_url: http://127.0.0.1:9000 + minio_url: http://192.168.50.100:29000 minio_public_url: http://58.215.212.230:8005/oss/ minio_name: root minio_pass: 12345678 diff --git a/physical-module-system/physical-system-biz/pom.xml b/physical-module-system/physical-system-biz/pom.xml index a7619d3..c5b91ca 100644 --- a/physical-module-system/physical-system-biz/pom.xml +++ b/physical-module-system/physical-system-biz/pom.xml @@ -45,11 +45,25 @@ drag-free-springboot3 1.1.2 - + + cn.hutool + hutool-core + + + cn.hutool + hutool-http + + org.jeecgframework.boot jeecg-boot-starter3-chatgpt 3.7.0 + + + cn.hutool + hutool-all + + diff --git a/physical-module-system/physical-system-biz/src/main/java/org/jeecg/modules/database/entity/CrawlerRecord.java b/physical-module-system/physical-system-biz/src/main/java/org/jeecg/modules/database/entity/CrawlerRecord.java old mode 100755 new mode 100644 index 9c53d92..27fff92 --- a/physical-module-system/physical-system-biz/src/main/java/org/jeecg/modules/database/entity/CrawlerRecord.java +++ b/physical-module-system/physical-system-biz/src/main/java/org/jeecg/modules/database/entity/CrawlerRecord.java @@ -1,77 +1,145 @@ package org.jeecg.modules.database.entity; -import java.io.Serializable; -import java.io.UnsupportedEncodingException; -import java.util.Date; -import java.math.BigDecimal; import com.baomidou.mybatisplus.annotation.IdType; import com.baomidou.mybatisplus.annotation.TableId; import com.baomidou.mybatisplus.annotation.TableName; -import com.baomidou.mybatisplus.annotation.TableLogic; -import lombok.Data; import com.fasterxml.jackson.annotation.JsonFormat; -import org.springframework.format.annotation.DateTimeFormat; -import org.jeecgframework.poi.excel.annotation.Excel; -import org.jeecg.common.aspect.annotation.Dict; import io.swagger.v3.oas.annotations.media.Schema; +import lombok.Data; import lombok.EqualsAndHashCode; import lombok.experimental.Accessors; +import org.jeecgframework.poi.excel.annotation.Excel; +import org.springframework.format.annotation.DateTimeFormat; + +import java.io.Serializable; +import java.util.Date; /** * @Description: 爬虫记录 * @Author: jeecg-boot - * @Date: 2024-09-03 + * @Date: 2024-10-14 * @Version: V1.0 */ @Data @TableName("crawler_record") @Accessors(chain = true) @EqualsAndHashCode(callSuper = false) -@Schema(description="爬虫记录") +@Schema(description = "爬虫记录") public class CrawlerRecord implements Serializable { private static final long serialVersionUID = 1L; - /**主键*/ - @TableId(type = IdType.ASSIGN_ID) + /** + * 主键 + */ + @TableId(type = IdType.ASSIGN_ID) @Schema(description = "主键") private String id; - /**创建人*/ + /** + * 创建人 + */ @Schema(description = "创建人") private String createBy; - /**创建日期*/ - @JsonFormat(timezone = "GMT+8",pattern = "yyyy-MM-dd HH:mm:ss") - @DateTimeFormat(pattern="yyyy-MM-dd HH:mm:ss") + /** + * 创建日期 + */ + @JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm:ss") + @DateTimeFormat(pattern = "yyyy-MM-dd HH:mm:ss") @Schema(description = "创建日期") private Date createTime; - /**更新人*/ + /** + * 更新人 + */ @Schema(description = "更新人") private String updateBy; - /**更新日期*/ - @JsonFormat(timezone = "GMT+8",pattern = "yyyy-MM-dd HH:mm:ss") - @DateTimeFormat(pattern="yyyy-MM-dd HH:mm:ss") + /** + * 更新日期 + */ + @JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm:ss") + @DateTimeFormat(pattern = "yyyy-MM-dd HH:mm:ss") @Schema(description = "更新日期") private Date updateTime; - /**所属部门*/ + /** + * 所属部门 + */ @Schema(description = "所属部门") private String sysOrgCode; - /**来源*/ - @Excel(name = "来源", width = 15) - @Schema(description = "来源") - private String source; - /**编号*/ - @Excel(name = "编号", width = 15) + /** + * 类型 + */ + @Excel(name = "类型", width = 15) + @Schema(description = "类型") + private String type; + /** + * 编号 + */ + @Excel(name = "编号", width = 15) @Schema(description = "编号") private String code; - /**名称*/ - @Excel(name = "名称", width = 15) - @Schema(description = "名称") - private String name; - /**文件名*/ - @Excel(name = "文件名", width = 15) - @Schema(description = "文件名") - private String fileName; - /**文件ID*/ - @Excel(name = "文件ID", width = 15) - @Schema(description = "文件ID") - private String fileId; + /** + * 制造商 + */ + @Excel(name = "制造商", width = 15) + @Schema(description = "制造商") + private String manufacturer; + /** + * 文件 + */ + @Excel(name = "文件", width = 15) + @Schema(description = "文件") + private String fileUrl; + /** + * 测试类型 + */ + @Excel(name = "测试类型", width = 15) + @Schema(description = "测试类型") + private String functionType; + /** + * 测试方式 + */ + @Excel(name = "测试方式", width = 15) + @Schema(description = "测试方式") + private String testMethod; + /** + * 分组 + */ + @Excel(name = "分组", width = 15) + @Schema(description = "分组") + private String category; + /** + * 二级分组 + */ + @Excel(name = "二级分组", width = 15) + @Schema(description = "二级分组") + private String subCategory; + /** + * 文档日期 + */ + @Excel(name = "文档日期", width = 15) + @Schema(description = "文档日期") + private String reportDate; + /** + * 测试技术 + */ + @Excel(name = "测试技术", width = 15) + @Schema(description = "测试技术") + private String technology; + /** + * 报告来源 + */ + @Excel(name = "报告来源", width = 15) + @Schema(description = "报告来源") + private String reportSource; + + /** + * 辐射测试类型 + */ + @Excel(name = "辐射测试类型", width = 15) + @Schema(description = "辐射测试类型") + private String radiationTestType; + /** + * 报告ID + */ + @Excel(name = "报告ID", width = 15) + @Schema(description = "报告ID") + private String reportId; } diff --git a/physical-module-system/physical-system-biz/src/main/java/org/jeecg/modules/database/service/impl/CrawlerRecordServiceImpl.java b/physical-module-system/physical-system-biz/src/main/java/org/jeecg/modules/database/service/impl/CrawlerRecordServiceImpl.java index f6ae6ee..2bc4f9e 100644 --- a/physical-module-system/physical-system-biz/src/main/java/org/jeecg/modules/database/service/impl/CrawlerRecordServiceImpl.java +++ b/physical-module-system/physical-system-biz/src/main/java/org/jeecg/modules/database/service/impl/CrawlerRecordServiceImpl.java @@ -1,10 +1,21 @@ package org.jeecg.modules.database.service.impl; +import cn.hutool.core.collection.CollUtil; +import cn.hutool.core.io.FileUtil; +import cn.hutool.core.io.IoUtil; import cn.hutool.core.thread.ThreadUtil; +import cn.hutool.core.util.ObjUtil; +import cn.hutool.core.util.ReUtil; +import cn.hutool.core.util.StrUtil; +import cn.hutool.core.util.URLUtil; +import cn.hutool.http.Header; +import cn.hutool.http.HttpRequest; +import cn.hutool.http.HttpResponse; import cn.hutool.http.HttpUtil; import com.alibaba.fastjson2.JSON; import com.alibaba.fastjson2.JSONArray; import com.alibaba.fastjson2.JSONObject; +import com.baomidou.mybatisplus.core.toolkit.Wrappers; import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl; import lombok.extern.slf4j.Slf4j; import okhttp3.*; @@ -24,9 +35,16 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import java.io.ByteArrayInputStream; +import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.util.*; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +import static org.jeecg.modules.online.auth.b.a.f; +import static org.jeecg.modules.online.auth.b.a.i; @Slf4j @Service @@ -40,6 +58,22 @@ public class CrawlerRecordServiceImpl extends ServiceImpl dispositions = execute.headerList(Header.CONTENT_DISPOSITION.getValue()); + String fileName = null; + if (CollUtil.isNotEmpty(dispositions)) { + for (String disposition : dispositions) { + fileName = ReUtil.getGroup1("filename" + "=([^;]+)", disposition); + } + System.out.println(URLUtil.decode("N2920A%20TID_1009_01.pdf", Charset.defaultCharset())); + System.out.println(fileName); + // filename* 采用了 RFC 5987 中规定的编码方式,优先读取 + + } + + } + /** * https://esarad.esa.int/ */ @@ -51,7 +85,8 @@ public class CrawlerRecordServiceImpl extends ServiceImpl> tableData = new ArrayList<>(); + + List tableData = new ArrayList<>(); Document doc = Jsoup.connect("https://esarad.esa.int").get(); Element table = doc.getElementById("dtReports"); @@ -78,33 +113,71 @@ public class CrawlerRecordServiceImpl extends ServiceImpl 10) { + break; + } + Element row = rows.get(j); + // Select all cells in the row + CrawlerRecord crawlerRecord = new CrawlerRecord(); + Elements cells = row.select("td"); if (cells.size() == headerNames.size()) { // Ensure the number of cells matches the number of headers - Map rowMap = new HashMap<>(); for (int i = 0; i < cells.size(); i++) { + String header = headerNames.get(i); String value = cells.get(i).text(); - rowMap.put(header, value); + switch (header) { + case "Radiation Test Method": + crawlerRecord.setTestMethod(value); + break; + case "EPPL Familiy": + crawlerRecord.setCategory(value); + break; + case "EPPL Group": + crawlerRecord.setSubCategory(value); + break; + case "DUT Manufacturer": + crawlerRecord.setManufacturer(value); + break; + case "Function": + crawlerRecord.setFunctionType(value); + break; + case "Report Date": + crawlerRecord.setReportDate(value); + break; + case "Report Source": + crawlerRecord.setReportSource(value); + break; + case "Technology": + crawlerRecord.setTechnology(value); + break; + case "Id": + crawlerRecord.setReportId(value); + break; + case "DUT part type": + crawlerRecord.setCode(value); + break; + case "Radiation Test Type": + crawlerRecord.setRadiationTestType(value); + break; + } + } // Add the map to the list - tableData.add(rowMap); + tableData.add(crawlerRecord); } } // Print the list of maps - for (Map rowMap : tableData) { - rowMap.put("fileId", rowMap.get("Id")); - rowMap.put("fileUrl", "https://esarad.esa.int/?id=" + rowMap.get("Id") + "&handler=DownloadDb"); - rowMap.put("fileName", rowMap.get("Id") + ".pdf"); - - System.out.println(rowMap); + for (CrawlerRecord rowMap : tableData) { + rowMap.setFileUrl("https://esarad.esa.int/?id=" + rowMap.getReportId() + "&handler=DownloadDb"); } - saveFiles(tableData, esaradKey); + saveEsaradFiles(tableData, esaradKey); } catch (Exception e) { redisUtil.del(esaradKey); @@ -139,20 +212,29 @@ public class CrawlerRecordServiceImpl extends ServiceImpl> tableData = new ArrayList<>(); + List tableData = new ArrayList<>(); - list.forEach(row -> { - String fileId = String.valueOf(((JSONArray) row).get(0)); - String fileNames = String.valueOf(((JSONArray) row).get(4)); - Map map = new HashMap<>(); - map.put("fileName", fileNames); + for (int i = 0; i < list.size(); i++) { + + if (i > 10) { + break; + } + JSONArray row = (JSONArray) list.get(i); + String fileNames = String.valueOf(row.get(4)); + CrawlerRecord map = new CrawlerRecord(); String fileUrls = fixFileNames(fileNames); - map.put("fileId", fileId); - map.put("fileUrl", fileUrls); - System.out.println(row); + map.setFileUrl(fileUrls); + + map.setCode(String.valueOf(row.get(0))); + map.setFunctionType(String.valueOf(row.get(1))); + map.setManufacturer(String.valueOf(row.get(2))); + map.setReportDate(String.valueOf(row.get(3))); + map.setTestMethod(String.valueOf(row.get(5))); + map.setCategory(String.valueOf(row.get(6))); + map.setReportId(map.getCode().replaceAll(" ", "")); tableData.add(map); - }); - saveFiles(tableData, radhomeKey); + } + saveRadhomeFiles(tableData, radhomeKey); } @Override @@ -180,29 +262,65 @@ public class CrawlerRecordServiceImpl extends ServiceImpl> fileList, String type) { + private void saveRadhomeFiles(List fileList, String type) { ThreadUtil.execute(() -> { try { - for (Map map : fileList) { - String fileId = map.get("fileId"); - String resultStr = ""; - String fileUrl = map.get("fileUrl"); + for (CrawlerRecord record : fileList) { + String fileUploadResult = ""; + String fileUrl = record.getFileUrl(); if (fileUrl.contains(";")) { String[] split = fileUrl.split(";"); List result = new ArrayList<>(); for (String s : split) { byte[] fileBytes = HttpUtil.downloadBytes(s); InputStream inputStream = new ByteArrayInputStream(fileBytes); - result.add(MinioUtil.upload(inputStream, s.substring(s.lastIndexOf("/") + 1))); - resultStr = StringUtils.join(result, ";"); + result.add(MinioUtil.upload(inputStream, "radhome/" + s.substring(s.lastIndexOf("/") + 1))); + fileUploadResult = StringUtils.join(result, ";"); } } else { byte[] fileBytes = HttpUtil.downloadBytes(fileUrl); InputStream inputStream = new ByteArrayInputStream(fileBytes); - resultStr = MinioUtil.upload(inputStream, fileUrl.substring(fileUrl.lastIndexOf("/") + 1)); + fileUploadResult = MinioUtil.upload(inputStream, "radhome/" + fileUrl.substring(fileUrl.lastIndexOf("/") + 1)); } + System.out.println(fileUploadResult); + if (StringUtils.isNotBlank(fileUploadResult)) { + record.setFileUrl(fileUploadResult); + save(record); + } + } + redisUtil.del(type); + } catch (Exception e) { + e.printStackTrace(); + } + }); + } + + private void saveEsaradFiles(List fileList, String type) { + ThreadUtil.execute(() -> { + try { + for (CrawlerRecord record : fileList) { + CrawlerRecord dbData = getOne(Wrappers.lambdaQuery().eq(CrawlerRecord::getReportSource, record.getReportId())); + if (Objects.nonNull(dbData)) { + continue; + } + String resultStr = ""; + + String dest = FileUtil.getTmpDirPath() + "esarad-" + record.getReportId() + "/"; + FileUtil.mkdir(dest); + long fileSize = HttpUtil.downloadFile(record.getFileUrl(), dest); + + if (fileSize > 0) { + List files = FileUtil.loopFiles(dest); + for (File file : files) { + resultStr = MinioUtil.upload(IoUtil.toStream(file), "esarad/" + record.getReportId() + "-" + URLUtil.decode(file.getName(), Charset.defaultCharset()) ); + } + } + System.out.println(resultStr); + if (StringUtils.isNotBlank(resultStr)) { + record.setFileUrl(resultStr); + save(record); + } } redisUtil.del(type); } catch (Exception e) { diff --git a/pom.xml b/pom.xml index fd71c01..f50cf53 100644 --- a/pom.xml +++ b/pom.xml @@ -35,7 +35,7 @@ 11.2.0.3 4.0 8.0.27 - 5.8.25 + 5.8.32 9.0.0 8.1.1.49 @@ -343,6 +343,11 @@ hutool-crypto ${hutool.version} + + cn.hutool + hutool-http + ${hutool.version} + io.minio