ls committed on 2024-12-03 00:02:44 +08:00
parent 30aaee523c
commit 215444ac4d
4 changed files with 203 additions and 180 deletions

View File

@@ -7,13 +7,13 @@ import mysql.connector
 from minio import Minio
 from minio.error import S3Error
-minio_public_url = 'http://58.215.212.230:8005/oss/'
+minio_public_url = '/oss/'
 # MySQL connection configuration
 db_config = {
     # 'host': 'physical-mysql',
     # 'port': 3306,
-    'host': '192.168.50.100',
-    'port': 23306,
+    'host': '127.0.0.1',
+    'port': 3306,
+    # 'host': '192.168.50.100',
+    # 'port': 23306,
     'user': 'root',
     'password': '123456',
     'database': 'physical-boot'
@@ -21,8 +21,8 @@ db_config = {
 # MinIO configuration
 minio_client = Minio(
     # "physical-minio:9000",  # MinIO server address or IP
-    "192.168.50.100:29000",  # MinIO server address or IP
+    "127.0.0.1:9000",  # MinIO server address or IP
+    # "192.168.50.100:29000",  # MinIO server address or IP
     access_key="root",  # replace with your Access Key
     secret_key="12345678",  # replace with your Secret Key
     secure=False  # False because the server speaks plain HTTP
@@ -38,7 +38,18 @@ def get_md5(input_string):
     # return the MD5 value as a hex string
     return md5_obj.hexdigest()
+def fetch_db_import_record(connection, data):
+    """Return how many rows in nasa_data_record already carry this id."""
+    cursor = connection.cursor()
+    try:
+        select_query = """select count(1) from `nasa_data_record` where `id`= %s;"""
+        cursor.execute(select_query, data)
+        result = cursor.fetchone()
+        return result[0]
+    except Exception as e:
+        print(e)
+    finally:
+        cursor.close()
 def save_to_db_import_record(connection, data):
     cursor = connection.cursor()
     try:
@@ -81,10 +92,10 @@ def upload_to_minio(connection,folder_path,type):
             object_name = f"{type}/{folder_name}/{file_name}"
             try:
                 # upload the file to MinIO
-                url = minio_client.fput_object(bucket_name, object_name, file_path)
+                # url = minio_client.fput_object(bucket_name, object_name, file_path)
                 print(f"Uploaded: {file_path} -> {bucket_name}/{object_name}")
                 # file_id = get_md5(object_name)
-                file_ids.append({file_name: os.path.join(minio_public_url, bucket_name, url.object_name)})
+                file_ids.append({file_name: os.path.join(minio_public_url, bucket_name, object_name)})
                 # db_file = [file_id, file_name,
                 #            minio_public_url + bucket_name + '/' + object_name, 'admin', datetime.now()]
                 # save_to_db_oss_file(connection, db_file)
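
Together, get_md5 and the new fetch_db_import_record give every scraper an idempotent import path: hash a source-prefixed key into the nasa_data_record id, skip the row if that id already exists, otherwise save it. A minimal sketch of the pattern, assuming an open mysql.connector connection; the import_once helper and its arguments are illustrative only, not part of this commit (the ESA scraper below carries the same check, commented out):

import mysql.connector
from common import db_config, get_md5, fetch_db_import_record, save_to_db_import_record

def import_once(connection, source_prefix, raw_id, data_db):
    # Stable primary key, e.g. get_md5('ESA-' + the row's first cell).
    table_id = get_md5(source_prefix + raw_id)
    count = fetch_db_import_record(connection, (table_id,))
    if count and count > 0:  # count is None when the lookup itself failed
        return False  # already imported by an earlier run
    save_to_db_import_record(connection, data_db)  # data_db[0] must be table_id
    return True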

View File

@@ -7,10 +7,10 @@ import mysql.connector
 import requests
 from bs4 import BeautifulSoup
-from common import upload_to_minio, save_to_db_import_record, db_config
+from common import upload_to_minio, save_to_db_import_record, db_config, get_md5, fetch_db_import_record
 # set the download directory
-download_dir = 'downloaded_files'
+download_dir = 'downloaded_files/ESA'
 os.makedirs(download_dir, exist_ok=True)
 # fetch the page content
@@ -31,10 +31,15 @@ def scrape():
     tables = soup.find_all('table')
     # extract the contents of the first table
-    for row in tables[0].find_all('tr')[2:]:  # skip the header rows
+    tab_content = tables[0].find_all('tr')[2:]  # skip the header rows
+    tab_content.reverse()  # process the rows in reverse order
+    for row in tab_content:
         cells = row.find_all(['td', 'th'])
         if len(cells) >= 6:  # make sure there are at least six columns
-            folder_name = cells[1].get_text(strip=True)  # column 2
+            table_id = get_md5('ESA-' + cells[0].get_text(strip=True))
+            # count = fetch_db_import_record(esa_connection, (table_id,))
+            # if count > 0:
+            #     continue
+            folder_name = table_id  # was cells[1].get_text(strip=True), column 2
             file_id = cells[0].get_text(strip=True)  # column 1
             download_url = f'https://esarad.esa.int/?id={file_id}&handler=DownloadDb'
@@ -57,9 +62,12 @@ def scrape():
             filename = f'{file_id}.pdf'
             file_path = os.path.join(folder_path, filename)
+            if not os.path.exists(file_path):
                 with open(file_path, 'wb') as f:
                     f.write(file_response.content)
                 print(f'ESA Downloaded: {file_path}')
+            else:
+                print(f'file exists: {file_path}')
             # create the CSV file
             csv_file_path = os.path.join(folder_path, 'data.csv')
@@ -204,7 +212,7 @@ def scrape():
             upload_ids = upload_to_minio(esa_connection, folder_path, 'ESA')
             origin_data = ','.join([c.get_text(strip=True) for c in cells])
-            data_db = ['ESA-' + cells[0].get_text(strip=True), 'Crawler', datetime.now(), None, None, None,
+            data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
                        cells[5].get_text(strip=True),
                        cells[1].get_text(strip=True), cells[1].get_text(strip=True),
                        cells[8].get_text(strip=True),
@@ -215,6 +223,8 @@ def scrape():
             save_to_db_import_record(esa_connection, data_db)
         else:
             print(f'Error: {response.status_code}')
+    except Exception as e:
+        print(e)
     finally:
         # close the cursor and connection
         esa_connection.close()

View File

@@ -76,7 +76,9 @@ def scrape():
print("total:" + str(json_data['RECORDS']))
for index, row in enumerate(json_data['ROWS']):
print("index:"+str(index))
part_number = row[0] # 部件编号
table_id= get_md5(''.join([row[0], row[1], row[2]]))
part_number =table_id # row[0] # 部件编号
file_links_str = row[4] # 文件链接
# 使用正则表达式分隔文件名
@@ -229,8 +231,7 @@ def scrape():
         manufacturer = row[2]
         experiment_date = row[3]
         origin_data = ','.join(row)
-        data_db = [get_md5(''.join([device_name, device_function, manufacturer])), 'Crawler', datetime.now(), None, None, None,
+        data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
                    device_type,
                    device_name, device_mode,
                    device_function,

View File

@@ -6,7 +6,7 @@ import mysql.connector
 import requests
 from lxml import html
-from common import db_config, upload_to_minio, save_to_db_import_record
+from common import db_config, upload_to_minio, save_to_db_import_record, get_md5
 # define the page URL
 url = 'https://radhome.gsfc.nasa.gov/radhome/papers/TIDPart.html'
@@ -55,7 +55,8 @@ def scrape():
         columns = row.xpath('.//td')
         if len(columns) < 8:
             continue  # skip incomplete rows
-        part_number = columns[2].text_content().strip()  # column 3, Part Number
+        table_id = get_md5('NASA-' + columns[0].text_content().strip())
+        part_number = table_id  # was columns[2].text_content().strip(), column 3 Part Number
         file_link_tag = columns[7].xpath('//a')  # the <a> tags in column 8
         # get the file name and the download link
@@ -199,7 +200,7 @@ def scrape():
         manufacturer = None
         experiment_date = columns[8].text_content().strip()
         origin_data = ','.join([c.text_content().strip() for c in columns])
-        data_db = ['NASA-' + columns[0].text_content().strip(), 'Crawler', datetime.now(), None, None, None,
+        data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
                    device_type,
                    device_name, device_mode,
                    device_function,