This commit is contained in:
ls
2024-10-23 17:39:31 +08:00
parent 5e0bbbd757
commit e7851a09bc
4 changed files with 445 additions and 380 deletions

92
scripts/common.py Normal file
View File

@@ -0,0 +1,92 @@
import hashlib
import os
from datetime import datetime
import mysql.connector
from minio import Minio
from minio.error import S3Error
# Public base URL used to build browser-reachable links to uploaded objects.
minio_public_url = 'http://58.215.212.230:8005/oss/'

# MySQL connection settings (consumed by mysql.connector.connect(**db_config)).
# NOTE(review): credentials are hard-coded; consider moving them to environment
# variables or a config file before wider deployment.
db_config = {
    # 'host': 'physical-mysql',
    # 'port': 3306,
    'host': '192.168.50.100',
    'port': 23306,
    'user': 'root',
    'password': '123456',
    'database': 'physical-boot'
}

# Shared MinIO client used by every upload helper in this module.
minio_client = Minio(
    # "physical-minio:9000",  # in-cluster MinIO alias (disabled)
    "192.168.50.100:29000",  # MinIO server address or IP
    access_key="root",  # replace with your Access Key
    secret_key="12345678",  # replace with your Secret Key
    secure=False  # False: the endpoint is plain HTTP
)

# Target bucket for all uploads.
bucket_name = 'physical'
def get_md5(input_string):
    """Return the hex MD5 digest of *input_string* (UTF-8 encoded)."""
    return hashlib.md5(input_string.encode('utf-8')).hexdigest()
def save_to_db_import_record(connection, data):
    """Insert one row into the `import_record` table.

    Args:
        connection: an open DB-API style connection (mysql.connector).
        data: 17-element sequence matching the column list in the INSERT.

    Errors are printed and swallowed (best-effort, crawler-style), but the
    failed transaction is rolled back so the connection remains usable for
    subsequent inserts.  The cursor is always closed.
    """
    insert_query = """INSERT INTO `import_record` (`id`, `create_by`, `create_time`, `update_by`, `update_time`, `sys_org_code`, `device_type`, `device_name`, `device_mode`, `device_function`, `device_batch`, `manufacturer`, `experiment_date`, `data_source`, `experiment_user`, `total_count`, `file_list`)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""
    cursor = connection.cursor()
    try:
        cursor.execute(insert_query, data)
        connection.commit()
    except Exception as e:
        # Undo the partial transaction, then log and keep going.
        try:
            connection.rollback()
        except Exception:
            pass  # rollback failure is non-fatal for a best-effort crawler
        print(e)
    finally:
        cursor.close()
def save_to_db_oss_file(connection, data):
    """Insert one row into the `oss_file` table.

    Args:
        connection: an open DB-API style connection (mysql.connector).
        data: 5-element sequence (id, file_name, url, create_by, create_time).

    Errors are printed and swallowed (best-effort, crawler-style), but the
    failed transaction is rolled back so the connection remains usable for
    subsequent inserts.  The cursor is always closed.
    """
    insert_query = """INSERT INTO `oss_file` (`id`,`file_name`,`url`,`create_by`,`create_time` )
    VALUES (%s, %s, %s, %s, %s);"""
    cursor = connection.cursor()
    try:
        cursor.execute(insert_query, data)
        connection.commit()
    except Exception as e:
        # Undo the partial transaction, then log and keep going.
        try:
            connection.rollback()
        except Exception:
            pass  # rollback failure is non-fatal for a best-effort crawler
        print(e)
    finally:
        cursor.close()
def upload_to_minio(connection, folder_path, type):
    """Upload every regular file in *folder_path* to MinIO and register each
    one in the `oss_file` table.

    Objects are stored under "<type>/<basename(folder_path)>/<file_name>".
    Returns the list of MD5-based ids of the successfully uploaded files;
    entries whose upload raises S3Error are skipped.
    """
    # Make sure the target bucket exists before the first upload.
    if not minio_client.bucket_exists(bucket_name):
        minio_client.make_bucket(bucket_name)

    folder_name = os.path.basename(folder_path)
    uploaded_ids = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        # Guard clause: ignore sub-directories and other non-file entries.
        if not os.path.isfile(file_path):
            continue
        object_name = f"{type}/{folder_name}/{file_name}"
        try:
            minio_client.fput_object(bucket_name, object_name, file_path)
            print(f"已上传: {file_path} -> {bucket_name}/{object_name}")
            oss_id = get_md5(object_name)
            uploaded_ids.append(oss_id)
            record = [oss_id, file_name,
                      minio_public_url + bucket_name + '/' + object_name,
                      'admin', datetime.now()]
            save_to_db_oss_file(connection, record)
        except S3Error as err:
            print(f"上传 {file_name} 时出错: {err}")
    return uploaded_ids

View File

@@ -1,5 +1,4 @@
import csv import csv
import hashlib
import os import os
import re import re
from datetime import datetime from datetime import datetime
@@ -7,102 +6,19 @@ from datetime import datetime
import mysql.connector import mysql.connector
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from minio import Minio
from minio.error import S3Error from common import upload_to_minio, save_to_db_import_record, db_config
# 设置下载目录 # 设置下载目录
download_dir = 'downloaded_files' download_dir = 'downloaded_files'
os.makedirs(download_dir, exist_ok=True) os.makedirs(download_dir, exist_ok=True)
# MySQL 连接配置
db_config = {
'host': '192.168.50.100',
'port': 23306,
'user': 'root',
'password': '123456',
'database': 'physical-boot'
}
connection = mysql.connector.connect(**db_config)
cursor = connection.cursor()
# minio 配置
minio_client = Minio(
"192.168.50.100:29000", # MinIO服务器地址或IP
access_key="root", # 替换为你的Access Key
secret_key="12345678", # 替换为你的Secret Key
secure=False # 如果使用的是http则为False
)
bucket_name = 'physical'
def get_md5(input_string):
# 创建MD5对象
md5_obj = hashlib.md5()
# 更新对象,注意字符串需要编码为字节
md5_obj.update(input_string.encode('utf-8'))
# 返回MD5值的十六进制字符串
return md5_obj.hexdigest()
def save_to_mysql(data):
try:
"""保存数据到 MySQL 数据库"""
insert_query = """INSERT INTO `import_record` (`id`, `create_by`, `create_time`, `update_by`, `update_time`, `sys_org_code`, `device_type`, `device_name`, `device_mode`, `device_function`, `device_batch`, `manufacturer`, `experiment_date`, `data_source`, `experiment_user`, `total_count`, `file_list`)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""
cursor.execute(insert_query, data)
connection.commit()
except Exception as e:
# 处理 ZeroDivisionError 的代码
print(e)
# 获取网页内容 # 获取网页内容
url = 'https://esarad.esa.int/' url = 'https://esarad.esa.int/'
def save_to_db_file(data):
try:
"""保存数据到 MySQL 数据库"""
insert_query = """INSERT INTO `oss_file` (`id`,`file_name`,`url`,`create_by`,`create_time` )
VALUES (%s, %s, %s, %s, %s);"""
cursor.execute(insert_query, data)
connection.commit()
except Exception as e:
# 处理 ZeroDivisionError 的代码
print(e)
def upload_to_minio(folder_path):
if not minio_client.bucket_exists(bucket_name):
minio_client.make_bucket(bucket_name)
folder_name = os.path.basename(folder_path)
# 遍历文件夹中的所有文件,并上传
file_ids = []
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
# 检查是否是文件,忽略非文件类型
if os.path.isfile(file_path):
object_name = f"ESA/{folder_name}/{file_name}"
try:
# 上传文件到 MinIO
minio_client.fput_object(bucket_name, object_name, file_path)
print(f"已上传: {file_path} -> {bucket_name}/{object_name}")
file_id = get_md5(object_name)
file_ids.append(file_id)
db_file = [file_id, file_name,
'http://58.215.212.230:8005/oss/' + bucket_name + '/' + object_name, 'admin', datetime.now()]
save_to_db_file(db_file)
except S3Error as err:
print(f"上传 {file_name} 时出错: {err}")
return file_ids
def scrape(): def scrape():
esa_connection = mysql.connector.connect(**db_config)
try: try:
response = requests.get(url) response = requests.get(url)
@@ -286,7 +202,7 @@ def scrape():
else: else:
print(f'Failed to download: {download_url}') print(f'Failed to download: {download_url}')
upload_ids = upload_to_minio(folder_path) upload_ids = upload_to_minio(esa_connection, folder_path,'ESA')
data_db = ['ESA-' + cells[0].get_text(strip=True), 'Crawler', datetime.now(), None, None, None, data_db = ['ESA-' + cells[0].get_text(strip=True), 'Crawler', datetime.now(), None, None, None,
cells[5].get_text(strip=True), cells[5].get_text(strip=True),
cells[1].get_text(strip=True), cells[1].get_text(strip=True), cells[1].get_text(strip=True), cells[1].get_text(strip=True),
@@ -295,10 +211,9 @@ def scrape():
cells[11].get_text(strip=True), cells[11].get_text(strip=True),
'ESA', None, None, ','.join(upload_ids) 'ESA', None, None, ','.join(upload_ids)
] ]
save_to_mysql(data_db) save_to_db_import_record(esa_connection, data_db)
else: else:
print(f'Error: {response.status_code}') print(f'Error: {response.status_code}')
finally: finally:
# 关闭游标和连接 # 关闭游标和连接
cursor.close() esa_connection.close()
connection.close()

View File

@@ -1,8 +1,13 @@
import requests
import os
import time
import re
import csv import csv
import os
import re
import time
from datetime import datetime
import mysql.connector
import requests
from common import db_config, upload_to_minio, save_to_db_import_record, get_md5
# 定义 API URL # 定义 API URL
api_url = 'https://radhome.gsfc.nasa.gov/radhome/dev/parts.cfc?method=getParts' api_url = 'https://radhome.gsfc.nasa.gov/radhome/dev/parts.cfc?method=getParts'
@@ -28,7 +33,6 @@ data = {
'sord': 'asc', 'sord': 'asc',
} }
# 创建文件夹以保存文件 # 创建文件夹以保存文件
os.makedirs('downloaded_files', exist_ok=True) os.makedirs('downloaded_files', exist_ok=True)
@@ -57,7 +61,11 @@ csv_header = [
"测试参数单位", "测试参数结果", "是否为加速试验后数据", "是否为退火数据", "测试参数单位", "测试参数结果", "是否为加速试验后数据", "是否为退火数据",
"退火温度", "退火时间", "原始数据", "数据处理方法", "其他需要说明的事项" "退火温度", "退火时间", "原始数据", "数据处理方法", "其他需要说明的事项"
] ]
def scrape(): def scrape():
nasa1_connection = mysql.connector.connect(**db_config)
try:
# 发送请求 # 发送请求
response = requests.post(api_url, headers=headers, data=data) response = requests.post(api_url, headers=headers, data=data)
response.raise_for_status() # 检查请求是否成功 response.raise_for_status() # 检查请求是否成功
@@ -73,11 +81,11 @@ def scrape():
file_links = re.split(r';|(?<=\.pdf)', file_links_str) file_links = re.split(r';|(?<=\.pdf)', file_links_str)
# 创建目录 # 创建目录
part_number_dir = os.path.join('downloaded_files', part_number) folder_path = os.path.join('downloaded_files', part_number)
os.makedirs(part_number_dir, exist_ok=True) os.makedirs(folder_path, exist_ok=True)
# 创建 CSV 文件 # 创建 CSV 文件
csv_file_path = os.path.join(part_number_dir, 'data.csv') csv_file_path = os.path.join(folder_path, 'data.csv')
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file: with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
csv_writer = csv.writer(csv_file) csv_writer = csv.writer(csv_file)
csv_writer.writerow(csv_header) # 写入表头 csv_writer.writerow(csv_header) # 写入表头
@@ -202,13 +210,34 @@ def scrape():
file_response.raise_for_status() file_response.raise_for_status()
# 保存文件 # 保存文件
file_path = os.path.join(part_number_dir, os.path.basename(file_url)) file_path = os.path.join(folder_path, os.path.basename(file_url))
with open(file_path, 'wb') as file: with open(file_path, 'wb') as file:
file.write(file_response.content) file.write(file_response.content)
print(f"NASA Download file: {file_path}") print(f"NASA Download file: {file_path}")
except requests.RequestException as e: except requests.RequestException as e:
print(f"NASA Download file error : {file_url}error: {e}") print(f"NASA Download file error : {file_url}error: {e}")
upload_ids = upload_to_minio(nasa1_connection, folder_path,'NASA')
device_type=row[6]
device_name=row[0]
device_mode=row[0]
device_function=row[1]
device_batch=None
manufacturer=row[2]
experiment_date=row[3]
data_db = [get_md5(row[0]), 'Crawler', datetime.now(), None, None, None,
device_type,
device_name, device_mode,
device_function,
device_batch, manufacturer,
experiment_date,
'NASA', None, None, ','.join(upload_ids)
]
save_to_db_import_record(nasa1_connection, data_db)
finally:
# 关闭游标和连接
nasa1_connection.close()
# 输出数据 # 输出数据
# for row in json_data['ROWS']: # for row in json_data['ROWS']:
# print(row) # print(row)

View File

@@ -1,7 +1,12 @@
import csv
import os
from datetime import datetime
import mysql.connector
import requests import requests
from lxml import html from lxml import html
import os
import csv from common import db_config, upload_to_minio, save_to_db_import_record
# 定义页面 URL # 定义页面 URL
url = 'https://radhome.gsfc.nasa.gov/radhome/papers/TIDPart.html' url = 'https://radhome.gsfc.nasa.gov/radhome/papers/TIDPart.html'
@@ -30,7 +35,11 @@ csv_header = [
"测试参数单位", "测试参数结果", "是否为加速试验后数据", "是否为退火数据", "测试参数单位", "测试参数结果", "是否为加速试验后数据", "是否为退火数据",
"退火温度", "退火时间", "原始数据", "数据处理方法", "其他需要说明的事项" "退火温度", "退火时间", "原始数据", "数据处理方法", "其他需要说明的事项"
] ]
def scrape(): def scrape():
nasa2_connection = mysql.connector.connect(**db_config)
try:
# 发送请求 # 发送请求
response = requests.get(url) response = requests.get(url)
response.raise_for_status() response.raise_for_status()
@@ -46,7 +55,6 @@ def scrape():
columns = row.xpath('.//td') columns = row.xpath('.//td')
if len(columns) < 8: if len(columns) < 8:
continue # 跳过不完整的行 continue # 跳过不完整的行
part_number = columns[2].text_content().strip() # 第三列 Part Number part_number = columns[2].text_content().strip() # 第三列 Part Number
file_link_tag = columns[7].xpath('//a') # 第八列的 <a> 标签 file_link_tag = columns[7].xpath('//a') # 第八列的 <a> 标签
@@ -54,11 +62,11 @@ def scrape():
file_url = 'https://radhome.gsfc.nasa.gov/' + file_link_tag[0].get('href') file_url = 'https://radhome.gsfc.nasa.gov/' + file_link_tag[0].get('href')
# 创建目录 # 创建目录
part_number_dir = os.path.join('downloaded_files', part_number) folder_path = os.path.join('downloaded_files', part_number)
os.makedirs(part_number_dir, exist_ok=True) os.makedirs(folder_path, exist_ok=True)
# 创建 CSV 文件 # 创建 CSV 文件
csv_file_path = os.path.join(part_number_dir, 'data.csv') csv_file_path = os.path.join(folder_path, 'data.csv')
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file: with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
csv_writer = csv.writer(csv_file) csv_writer = csv.writer(csv_file)
csv_writer.writerow(csv_header) # 写入表头 csv_writer.writerow(csv_header) # 写入表头
@@ -176,13 +184,34 @@ def scrape():
file_response.raise_for_status() file_response.raise_for_status()
# 保存文件 # 保存文件
file_path = os.path.join(part_number_dir, os.path.basename(file_url)) file_path = os.path.join(folder_path, os.path.basename(file_url))
with open(file_path, 'wb') as file: with open(file_path, 'wb') as file:
file.write(file_response.content) file.write(file_response.content)
print(f"NASA2 Download file: {file_path}") print(f"NASA2 Download file: {file_path}")
except requests.RequestException as e: except requests.RequestException as e:
print(f"NASA2 Download file error: {file_url}error: {e}") print(f"NASA2 Download file error: {file_url}error: {e}")
upload_ids = upload_to_minio(nasa2_connection, folder_path, 'NASA')
device_type = columns[4].text_content().strip()
device_name = columns[2].text_content().strip()
device_mode = columns[2].text_content().strip()
device_function = columns[3].text_content().strip()
device_batch = None
manufacturer = None
experiment_date = columns[8].text_content().strip()
data_db = ['NASA-' + columns[0].text_content().strip(), 'Crawler', datetime.now(), None, None, None,
device_type,
device_name, device_mode,
device_function,
device_batch, manufacturer,
experiment_date,
'NASA', None, None, ','.join(upload_ids)
]
save_to_db_import_record(nasa2_connection, data_db)
except Exception as e:
print(f"error: {e}")
finally:
# 关闭游标和连接
nasa2_connection.close()
# 输出数据 # 输出数据
# for row in table.xpath('.//tr')[1:]: # for row in table.xpath('.//tr')[1:]:
# columns = row.xpath('.//td') # columns = row.xpath('.//td')