This commit is contained in:
ls
2024-11-12 15:46:28 +08:00
parent d02773609f
commit 835218ffa6
8 changed files with 161 additions and 133 deletions

View File

@@ -1,6 +1,7 @@
import hashlib
import os
from datetime import datetime
from datetime import timedelta
import mysql.connector
from minio import Minio
@@ -9,10 +10,10 @@ from minio.error import S3Error
minio_public_url = 'http://58.215.212.230:8005/oss/'
# MySQL 连接配置
db_config = {
'host': 'physical-mysql',
'port': 3306,
# 'host': '192.168.50.100',
# 'port': 23306,
# 'host': 'physical-mysql',
# 'port': 3306,
'host': '192.168.50.100',
'port': 23306,
'user': 'root',
'password': '123456',
'database': 'physical-boot'
@@ -20,8 +21,8 @@ db_config = {
# minio 配置
minio_client = Minio(
"physical-minio:9000", # MinIO服务器地址或IP
# "192.168.50.100:29000", # MinIO服务器地址或IP
# "physical-minio:9000", # MinIO服务器地址或IP
"192.168.50.100:29000", # MinIO服务器地址或IP
access_key="root", # 替换为你的Access Key
secret_key="12345678", # 替换为你的Secret Key
secure=False # 如果使用的是http则为False
@@ -42,8 +43,8 @@ def save_to_db_import_record(connection,data):
cursor = connection.cursor()
try:
"""保存数据到 MySQL 数据库"""
insert_query = """INSERT INTO `import_record` (`id`, `create_by`, `create_time`, `update_by`, `update_time`, `sys_org_code`, `device_type`, `device_name`, `device_mode`, `device_function`, `device_batch`, `manufacturer`, `experiment_date`, `data_source`, `experiment_user`, `total_count`, `file_list`)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""
insert_query = """INSERT INTO `nasa_data_record` (`id`, `create_by`, `create_time`, `update_by`, `update_time`, `sys_org_code`, `device_type`, `device_name`, `device_mode`, `device_function`, `device_batch`, `manufacturer`, `experiment_date`, `data_source`, `experiment_user`, `total_count`, `file_list`,`origin_data`)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""
cursor.execute(insert_query, data)
connection.commit()
except Exception as e:
@@ -80,13 +81,17 @@ def upload_to_minio(connection,folder_path,type):
object_name = f"{type}/{folder_name}/{file_name}"
try:
# 上传文件到 MinIO
minio_client.fput_object(bucket_name, object_name, file_path)
url=minio_client.fput_object(bucket_name, object_name, file_path)
print(f"已上传: {file_path} -> {bucket_name}/{object_name}")
file_id = get_md5(object_name)
file_ids.append(file_id)
db_file = [file_id, file_name,
minio_public_url + bucket_name + '/' + object_name, 'admin', datetime.now()]
save_to_db_oss_file(connection,db_file)
# file_id = get_md5(object_name)
file_ids.append({file_name:os.path.join(minio_public_url,bucket_name,url.object_name)})
# db_file = [file_id, file_name,
# minio_public_url + bucket_name + '/' + object_name, 'admin', datetime.now()]
# save_to_db_oss_file(connection,db_file)
# expires = timedelta(days=1)
# presigned_url = minio_client.presigned_get_object(bucket_name, object_name, expires=expires)
# print(f"Presigned URL for {object_name}: {presigned_url}")
except S3Error as err:
print(f"上传 {file_name} 时出错: {err}")
return file_ids

View File

@@ -44,7 +44,7 @@ def scrape():
# 下载文件并获取文件名
file_response = requests.get(download_url)
if file_response.status_code == 200:
if file_response and file_response.status_code == 200:
# 从响应头获取文件名
content_disposition = file_response.headers.get('Content-Disposition')
filename = ''
@@ -202,16 +202,17 @@ def scrape():
else:
print(f'Failed to download: {download_url}')
upload_ids = upload_to_minio(esa_connection, folder_path,'ESA')
data_db = ['ESA-' + cells[0].get_text(strip=True), 'Crawler', datetime.now(), None, None, None,
cells[5].get_text(strip=True),
cells[1].get_text(strip=True), cells[1].get_text(strip=True),
cells[8].get_text(strip=True),
cells[7].get_text(strip=True), cells[2].get_text(strip=True),
cells[11].get_text(strip=True),
'ESA', None, None, ','.join(upload_ids)
]
save_to_db_import_record(esa_connection, data_db)
upload_ids = upload_to_minio(esa_connection, folder_path, 'ESA')
origin_data = ','.join([c.get_text(strip=True) for c in cells])
data_db = ['ESA-' + cells[0].get_text(strip=True), 'Crawler', datetime.now(), None, None, None,
cells[5].get_text(strip=True),
cells[1].get_text(strip=True), cells[1].get_text(strip=True),
cells[8].get_text(strip=True),
cells[7].get_text(strip=True), cells[2].get_text(strip=True),
cells[11].get_text(strip=True),
'ESA', None, None, str(upload_ids), origin_data
]
save_to_db_import_record(esa_connection, data_db)
else:
print(f'Error: {response.status_code}')
finally:

View File

@@ -73,7 +73,9 @@ def scrape():
# 解析 JSON 数据
json_data = response.json()
# 遍历数据并下载文件
for row in json_data['ROWS']:
print("total:" + str(json_data['RECORDS']))
for index, row in enumerate(json_data['ROWS']):
print("index:"+str(index))
part_number = row[0] # 部件编号
file_links_str = row[4] # 文件链接
@@ -209,7 +211,7 @@ def scrape():
file_response = requests.get(file_url)
file_response.raise_for_status()
# 保存文件
# 保存文件1685+431+314
file_path = os.path.join(folder_path, os.path.basename(file_url))
with open(file_path, 'wb') as file:
file.write(file_response.content)
@@ -226,15 +228,19 @@ def scrape():
device_batch=None
manufacturer=row[2]
experiment_date=row[3]
data_db = [get_md5(row[0]), 'Crawler', datetime.now(), None, None, None,
origin_data = ','.join(row)
data_db = [get_md5(''.join([device_name,device_function,manufacturer])), 'Crawler', datetime.now(), None, None, None,
device_type,
device_name, device_mode,
device_function,
device_batch, manufacturer,
experiment_date,
'NASA', None, None, ','.join(upload_ids)
'NASA', None, None, str(upload_ids),origin_data
]
save_to_db_import_record(nasa1_connection, data_db)
except Exception as e:
print(e)
finally:
# 关闭游标和连接
nasa1_connection.close()

View File

@@ -198,13 +198,14 @@ def scrape():
device_batch = None
manufacturer = None
experiment_date = columns[8].text_content().strip()
origin_data = ','.join([c.text_content().strip() for c in columns])
data_db = ['NASA-' + columns[0].text_content().strip(), 'Crawler', datetime.now(), None, None, None,
device_type,
device_name, device_mode,
device_function,
device_batch, manufacturer,
experiment_date,
'NASA', None, None, ','.join(upload_ids)
'NASA', None, None, str(upload_ids),origin_data
]
save_to_db_import_record(nasa2_connection, data_db)
except Exception as e: