This commit is contained in:
ls
2024-10-23 17:39:31 +08:00
parent 5e0bbbd757
commit e7851a09bc
4 changed files with 445 additions and 380 deletions

View File

@@ -1,5 +1,4 @@
import csv
import hashlib
import os
import re
from datetime import datetime
@@ -7,102 +6,19 @@ from datetime import datetime
import mysql.connector
import requests
from bs4 import BeautifulSoup
from minio import Minio
from minio.error import S3Error
from common import upload_to_minio, save_to_db_import_record, db_config
# Set the download directory (created on import if it does not exist yet)
download_dir = 'downloaded_files'
os.makedirs(download_dir, exist_ok=True)
# MySQL connection settings
# NOTE(review): host and credentials are hard-coded; consider loading them
# from the environment or a config file instead of committing them.
# NOTE(review): `db_config` is also imported from `common` above; this
# assignment rebinds that name - confirm which definition is intended.
db_config = {
'host': '192.168.50.100',
'port': 23306,
'user': 'root',
'password': '123456',
'database': 'physical-boot'
}
# Module-level connection and cursor, opened at import time and used by
# save_to_mysql / save_to_db_file.
connection = mysql.connector.connect(**db_config)
cursor = connection.cursor()
# MinIO settings
minio_client = Minio(
"192.168.50.100:29000", # MinIO server address or IP
access_key="root", # replace with your Access Key
secret_key="12345678", # replace with your Secret Key
secure=False # False when plain http is used
)
# Bucket that upload_to_minio writes into
bucket_name = 'physical'
def get_md5(input_string):
    """Return the hexadecimal MD5 digest of ``input_string``.

    Args:
        input_string: Text to hash; it is encoded as UTF-8 before hashing.

    Returns:
        str: The digest as a 32-character lowercase hex string.
    """
    # hashlib works on bytes, so the text has to be encoded first.
    encoded = input_string.encode('utf-8')
    return hashlib.md5(encoded).hexdigest()
def save_to_mysql(data):
    """Insert one row into the ``import_record`` table and commit it.

    Uses the module-level ``cursor`` and ``connection``.

    Args:
        data: Sequence of values bound, in order, to the ``%s`` placeholders
            of the INSERT statement (``id`` through ``file_list``).

    Returns:
        None. Failures are printed instead of raised, so one bad row does not
        stop the caller.
    """
    insert_query = """INSERT INTO `import_record` (`id`, `create_by`, `create_time`, `update_by`, `update_time`, `sys_org_code`, `device_type`, `device_name`, `device_mode`, `device_function`, `device_batch`, `manufacturer`, `experiment_date`, `data_source`, `experiment_user`, `total_count`, `file_list`)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"""
    try:
        cursor.execute(insert_query, data)
        connection.commit()
    except Exception as e:
        # Best-effort insert: report the failure and carry on.
        print(e)
        try:
            # Drop anything the failed statement left pending so the next
            # commit on this shared connection starts clean.
            connection.rollback()
        except Exception as rollback_error:
            # The connection itself may be unusable; keep the original
            # print-and-continue behaviour rather than raising from here.
            print(rollback_error)
# URL of the web page whose content is fetched
url = 'https://esarad.esa.int/'
def save_to_db_file(data):
    """Insert one row into the ``oss_file`` table and commit it.

    Uses the module-level ``cursor`` and ``connection``.

    Args:
        data: Sequence of five values bound, in order, to ``id``,
            ``file_name``, ``url``, ``create_by`` and ``create_time``.

    Returns:
        None. Failures are printed instead of raised, so one bad row does not
        stop the caller.
    """
    insert_query = """INSERT INTO `oss_file` (`id`,`file_name`,`url`,`create_by`,`create_time` )
VALUES (%s, %s, %s, %s, %s);"""
    try:
        cursor.execute(insert_query, data)
        connection.commit()
    except Exception as e:
        # Best-effort insert: report the failure and carry on.
        print(e)
        try:
            # Drop anything the failed statement left pending so the next
            # commit on this shared connection starts clean.
            connection.rollback()
        except Exception as rollback_error:
            # The connection itself may be unusable; keep the original
            # print-and-continue behaviour rather than raising from here.
            print(rollback_error)
def upload_to_minio(folder_path):
    """Upload every regular file directly inside ``folder_path`` to MinIO.

    The bucket ``bucket_name`` is created first when it does not exist. Each
    file is stored as ``ESA/<folder name>/<file name>``; after a successful
    upload a matching row is written through ``save_to_db_file``.

    Args:
        folder_path: Local directory to upload. Entries that are not regular
            files (e.g. sub-directories) are skipped.

    Returns:
        list: One id per uploaded file, computed as ``get_md5`` of the object
        name. A file whose upload raises ``S3Error`` is reported with
        ``print`` and contributes no id.
    """
    if not minio_client.bucket_exists(bucket_name):
        minio_client.make_bucket(bucket_name)

    # basename('some/dir/') is '' - normalise first so a trailing separator
    # does not send every object to 'ESA//<file>'.
    folder_name = os.path.basename(os.path.normpath(folder_path))

    file_ids = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        # Only regular files are uploaded.
        if not os.path.isfile(file_path):
            continue

        object_name = f"ESA/{folder_name}/{file_name}"
        try:
            minio_client.fput_object(bucket_name, object_name, file_path)
            print(f"已上传: {file_path} -> {bucket_name}/{object_name}")

            # The id is derived from the object name, so the same object
            # name always yields the same id.
            file_id = get_md5(object_name)
            file_ids.append(file_id)

            file_url = f"http://58.215.212.230:8005/oss/{bucket_name}/{object_name}"
            save_to_db_file([file_id, file_name, file_url, 'admin', datetime.now()])
        except S3Error as err:
            print(f"上传 {file_name} 时出错: {err}")
    return file_ids
def scrape():
esa_connection = mysql.connector.connect(**db_config)
try:
response = requests.get(url)
@@ -286,7 +202,7 @@ def scrape():
else:
print(f'Failed to download: {download_url}')
upload_ids = upload_to_minio(folder_path)
upload_ids = upload_to_minio(esa_connection, folder_path,'ESA')
data_db = ['ESA-' + cells[0].get_text(strip=True), 'Crawler', datetime.now(), None, None, None,
cells[5].get_text(strip=True),
cells[1].get_text(strip=True), cells[1].get_text(strip=True),
@@ -295,10 +211,9 @@ def scrape():
cells[11].get_text(strip=True),
'ESA', None, None, ','.join(upload_ids)
]
save_to_mysql(data_db)
save_to_db_import_record(esa_connection, data_db)
else:
print(f'Error: {response.status_code}')
finally:
# 关闭游标和连接
cursor.close()
connection.close()
esa_connection.close()