# Source listing metadata: 254 lines, 10 KiB, Python
import json
|
||
import os
|
||
import re
|
||
import time
|
||
from datetime import datetime
|
||
|
||
import mysql.connector
|
||
import requests
|
||
import pandas as pd
|
||
|
||
from common import db_config, upload_to_minio, save_to_db_import_record, get_md5
|
||
|
||
# Reference pages for the data source:
#   https://nepp.nasa.gov/radhome/dev/parts.cfc?method=getParts
#   https://nepp.nasa.gov/radhome/raddatabase/raddatabase.html

# URL of the parts-list API.
api_url = 'https://nepp.nasa.gov/radhome/dev/parts.cfc?method=getParts'

# HTTP request headers sent with the API call.
headers = {
    'accept': 'application/json, text/javascript, */*; q=0.01',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'user-agent': 'Mozilla/5.0',
    'x-requested-with': 'XMLHttpRequest',
}

# Current time in milliseconds since the epoch, as a string.
# Evaluated once, when this module is loaded.
current_timestamp = f"{int(time.time() * 1000)}"

# Form fields of the API request: a single page of up to 10000 rows,
# sorted ascending by part number.
data = dict(
    _search='false',
    nd=current_timestamp,
    rows='10000',
    page='1',
    sidx='partnumber',
    sord='asc',
)

# Make sure the folder that receives the downloaded files exists.
os.makedirs('downloaded_files', exist_ok=True)

# Prefix prepended to file names that are not already absolute URLs.
file_prefix = 'https://nepp.nasa.gov/radhome/papers/'

# Column titles of the generated spreadsheet (100 columns, order matters).
# NOTE: "生产单位" and "其他需要说明的事项" each occur twice; the duplicates
# are kept exactly as they are because rows are filled by position.
csv_header = [
    # Columns 0-17: test overview.
    "序号", "试验对象类型", "试验开始日期", "试验结束日期",
    "试验对象名称", "试验对象型号", "试验对象数量", "试验性质",
    "试验目的", "装置名称", "数据提供单位", "试验委托单位",
    "失效判据", "失效数量", "试验结果描述", "成果",
    "来源项目名称", "来源项目类型",
    # Columns 18-41: component description.
    "分类", "元器件名称", "元器件型号", "元器件批号",
    "生产单位", "是否国产", "元器件成熟度", "晶圆材料",
    "晶圆批号", "封装材料", "封装技术", "是否倒装",
    "制造工艺", "工艺特征尺寸", "工艺平台", "工艺代号",
    "工艺版本", "质量等级", "加固措施", "工作原理",
    "供货能力", "应用经历", "规范手册", "器件图片",
    # Columns 42-48: electronic system description.
    "电子系统分类", "电子系统名称", "电子系统型号", "生产单位",
    "电子系统功能", "电子系统加固措施", "电子系统图片",
    # Columns 49-55: material description.
    "材料名称", "材料型号", "材料组分", "材料用途",
    "材料生产单位", "材料物理结构", "材料使用经历",
    # Columns 56-68: test outline, procedure and bias conditions.
    "辐照试验大纲", "大纲审核专家类别", "辐照试验所依据的标准规范",
    "试验步骤(过程)描述", "辐照过程是否加电",
    "直流偏置条件描述", "交流偏置条件描述", "时钟频率",
    "测试图形", "其他偏置条件", "辐照偏置原理图",
    "测试方式", "测试原理图",
    # Columns 69-76: instruments, software and site photos.
    "试验用仪器名称", "试验用仪器型号", "试验用仪器生产厂家",
    "试验用仪器检定证书", "试验用软件名称", "试验用软件开发单位",
    "试验用软件版本号", "试验现场照片",
    # Columns 77-84: people involved and remarks.
    "测试人员姓名", "测试人员单位", "测试人员电话", "装置运行人员",
    "第三方人员", "第三方人员单位", "第三方人员电话", "其他需要说明的事项",
    # Columns 85-99: dose, measured parameters, annealing and raw data.
    "是否采用铅铝屏蔽", "剂量率", "总剂量", "剂量等效材料",
    "试验对象编号", "测试参数名称", "测试参数单位", "测试参数结果",
    "是否为加速试验后数据", "是否为退火数据", "退火温度", "退火时间",
    "原始数据", "数据处理方法", "其他需要说明的事项",
]
|
||
|
||
|
||
# (connect, read) timeouts in seconds applied to every HTTP call, so that a
# stalled server cannot block the crawl forever.
_REQUEST_TIMEOUT = (10, 120)

# The highest index read from an API row is row[6], so a usable row has
# at least seven fields.
_MIN_ROW_FIELDS = 7


def _split_file_links(file_links_str):
    """Return the non-empty file names contained in a raw link cell.

    Names are separated by ';' and/or simply concatenated; a split is also
    made right after every '.pdf' so that run-together names are separated.
    """
    pieces = re.split(r';|(?<=\.pdf)', file_links_str)
    return [piece.strip() for piece in pieces if piece.strip()]


def _build_data_row(row):
    """Return one spreadsheet row (same length as ``csv_header``) for *row*.

    Only four columns are filled from the API row; all others stay empty.
    ``list.index`` returns the first match, so "生产单位" resolves to the
    component manufacturer column, not the later electronic-system one.
    """
    values = [""] * len(csv_header)
    filled = (
        ("试验开始日期", row[3]),
        ("试验对象型号", row[1]),
        ("试验性质", row[6]),
        ("生产单位", row[2]),
    )
    for column, value in filled:
        values[csv_header.index(column)] = value
    return values


def _download_attachments(file_names, folder_path):
    """Download every named file into *folder_path*; failures are reported and skipped."""
    for file_name in file_names:
        # Names that are not absolute URLs are relative to the papers folder.
        file_url = file_name if file_name.startswith('http') else file_prefix + file_name

        target_name = os.path.basename(file_url)
        if not target_name:
            # A URL ending in '/' has no file name; opening the folder itself
            # for writing would raise and abort the whole crawl.
            print(f"NASA Download file error : {file_url},error: no file name in URL")
            continue

        try:
            file_response = requests.get(file_url, timeout=_REQUEST_TIMEOUT)
            file_response.raise_for_status()
        except requests.RequestException as e:
            print(f"NASA Download file error : {file_url},error: {e}")
            continue

        file_path = os.path.join(folder_path, target_name)
        with open(file_path, 'wb') as file:
            file.write(file_response.content)
        print(f"NASA Download file: {file_path}")


def scrape():
    """Fetch the NASA parts list and archive every row.

    For each row returned by the API this function:

    1. derives an id from the MD5 of the first three fields,
    2. writes a one-row ``data.xlsx`` into ``downloaded_files/<id>/``,
    3. downloads the files referenced by the row into the same folder,
    4. passes the folder to ``upload_to_minio`` and stores an import record
       through ``save_to_db_import_record``.

    Rows with fewer than seven fields are reported and skipped. Any other
    error ends the crawl: it is printed and not re-raised. The database
    connection is always closed. An error while opening the connection
    propagates to the caller.
    """
    nasa1_connection = mysql.connector.connect(**db_config)
    try:
        # Take the `nd` timestamp at request time instead of reusing the value
        # frozen at import time; `data` itself is left untouched.
        # NOTE(review): `nd` is presumably a cache-buster (jqGrid convention) - confirm.
        payload = dict(data, nd=str(int(time.time() * 1000)))

        response = requests.post(api_url, headers=headers, data=payload,
                                 timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()  # fail early on HTTP errors

        json_data = response.json()
        print("total:" + str(json_data['RECORDS']))

        for index, raw_row in enumerate(json_data['ROWS']):
            print("index:" + str(index))

            if len(raw_row) < _MIN_ROW_FIELDS:
                print(f"NASA skip row {index}: expected at least "
                      f"{_MIN_ROW_FIELDS} fields, got {len(raw_row)}")
                continue

            # The joins below need strings; a None or numeric cell would
            # raise TypeError and end the crawl. Rows that already consist of
            # strings are unchanged by this conversion.
            row = ['' if cell is None else str(cell) for cell in raw_row]

            # The id doubles as the name of the per-row folder.
            table_id = get_md5(''.join(row[:3]))
            folder_path = os.path.join('downloaded_files', table_id)
            os.makedirs(folder_path, exist_ok=True)

            # One-row spreadsheet describing this entry.
            xlsx_file_path = os.path.join(folder_path, 'data.xlsx')
            sheet = pd.DataFrame([_build_data_row(row)], columns=csv_header)
            sheet.to_excel(xlsx_file_path, index=False)

            # row[4] holds the names/links of the files to download.
            _download_attachments(_split_file_links(row[4]), folder_path)

            upload_ids = upload_to_minio(nasa1_connection, folder_path, 'NASA')

            # NOTE(review): row[0] feeds both device_name and device_mode, and
            # row[1] feeds device_function although the spreadsheet files it
            # under "试验对象型号" - confirm the intended mapping.
            device_type = row[6]
            device_name = row[0]
            device_mode = row[0]
            device_function = row[1]
            device_batch = None
            manufacturer = row[2]
            experiment_date = row[3]
            origin_data = ','.join(row)

            # Field order is dictated by save_to_db_import_record (defined in
            # `common`); keep it exactly as is.
            data_db = [
                table_id, 'Crawler', datetime.now(), None, None, None,
                device_type,
                device_name, device_mode,
                device_function,
                device_batch, manufacturer,
                experiment_date,
                'NASA', None, None, str(upload_ids), origin_data,
            ]
            save_to_db_import_record(nasa1_connection, data_db)
    except Exception as e:
        # Top-level boundary of the crawl: report the error and stop.
        print(e)
    finally:
        # Always release the database connection.
        nasa1_connection.close()
|
||
# Print the data
# for row in json_data['ROWS']:
#     print(row)

# Start the crawl. There is no __main__ guard, so this also runs on import.
scrape()