scripts
This commit is contained in:
@@ -7,13 +7,13 @@ import mysql.connector
|
||||
from minio import Minio
|
||||
from minio.error import S3Error
|
||||
|
||||
minio_public_url = 'http://58.215.212.230:8005/oss/'
|
||||
minio_public_url = '/oss/'
|
||||
# MySQL 连接配置
|
||||
db_config = {
|
||||
# 'host': 'physical-mysql',
|
||||
# 'port': 3306,
|
||||
'host': '192.168.50.100',
|
||||
'port': 23306,
|
||||
'host': '127.0.0.1',
|
||||
'port': 3306,
|
||||
# 'host': '192.168.50.100',
|
||||
# 'port': 23306,
|
||||
'user': 'root',
|
||||
'password': '123456',
|
||||
'database': 'physical-boot'
|
||||
@@ -21,8 +21,8 @@ db_config = {
|
||||
|
||||
# minio 配置
|
||||
minio_client = Minio(
|
||||
# "physical-minio:9000", # MinIO服务器地址或IP
|
||||
"192.168.50.100:29000", # MinIO服务器地址或IP
|
||||
"127.0.0.1:9000", # MinIO服务器地址或IP
|
||||
# "192.168.50.100:29000", # MinIO服务器地址或IP
|
||||
access_key="root", # 替换为你的Access Key
|
||||
secret_key="12345678", # 替换为你的Secret Key
|
||||
secure=False # 如果使用的是http则为False
|
||||
@@ -38,7 +38,18 @@ def get_md5(input_string):
|
||||
# 返回MD5值的十六进制字符串
|
||||
return md5_obj.hexdigest()
|
||||
|
||||
|
||||
def fetch_db_import_record(connection, data):
    """Count the rows in `nasa_data_record` whose `id` matches.

    Args:
        connection: Open database connection exposing ``cursor()``.
        data: Parameter sequence bound to the single ``%s`` placeholder,
            e.g. ``(record_id,)``.

    Returns:
        The count reported by the query. Returns 0 when the query fails or
        yields no row, so callers can always compare the result with ``> 0``
        (previously a failure returned ``None``, which breaks that check).
    """
    cursor = connection.cursor()
    try:
        count_query = """select count(1) from `nasa_data_record` where `id`= %s;"""
        cursor.execute(count_query, data)
        row = cursor.fetchone()
        return row[0] if row else 0
    except Exception as e:
        # Best-effort lookup: report the error and treat the record as absent
        # rather than aborting the caller's whole crawl.
        print(e)
        return 0
    finally:
        # Always release the cursor, on success and on failure.
        cursor.close()
|
||||
def save_to_db_import_record(connection,data):
|
||||
cursor = connection.cursor()
|
||||
try:
|
||||
@@ -81,10 +92,10 @@ def upload_to_minio(connection,folder_path,type):
|
||||
object_name = f"{type}/{folder_name}/{file_name}"
|
||||
try:
|
||||
# 上传文件到 MinIO
|
||||
url=minio_client.fput_object(bucket_name, object_name, file_path)
|
||||
# url=minio_client.fput_object(bucket_name, object_name, file_path)
|
||||
print(f"已上传: {file_path} -> {bucket_name}/{object_name}")
|
||||
# file_id = get_md5(object_name)
|
||||
file_ids.append({file_name:os.path.join(minio_public_url,bucket_name,url.object_name)})
|
||||
file_ids.append({file_name:os.path.join(minio_public_url,bucket_name,object_name)})
|
||||
# db_file = [file_id, file_name,
|
||||
# minio_public_url + bucket_name + '/' + object_name, 'admin', datetime.now()]
|
||||
# save_to_db_oss_file(connection,db_file)
|
||||
|
||||
@@ -7,10 +7,10 @@ import mysql.connector
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from common import upload_to_minio, save_to_db_import_record, db_config
|
||||
from common import upload_to_minio, save_to_db_import_record, db_config, get_md5, fetch_db_import_record
|
||||
|
||||
# 设置下载目录
|
||||
download_dir = 'downloaded_files'
|
||||
download_dir = 'downloaded_files/ESA'
|
||||
os.makedirs(download_dir, exist_ok=True)
|
||||
|
||||
# 获取网页内容
|
||||
@@ -31,10 +31,15 @@ def scrape():
|
||||
tables = soup.find_all('table')
|
||||
|
||||
# 提取第一个表格的内容
|
||||
for row in tables[0].find_all('tr')[2:]: # 跳过标题行
|
||||
tab_content = tables[0].find_all('tr')[2:]
|
||||
tab_content.reverse()
|
||||
for row in tab_content: # 跳过标题行
|
||||
cells = row.find_all(['td', 'th'])
|
||||
if len(cells) >= 6: # 确保有至少六列
|
||||
folder_name = cells[1].get_text(strip=True) # 第2列
|
||||
table_id = get_md5('ESA-' + cells[0].get_text(strip=True))
|
||||
# count = fetch_db_import_record(esa_connection, (table_id,))
|
||||
# if count > 0:
|
||||
# continue
|
||||
folder_name = table_id # cells[1].get_text(strip=True) # 第2列
|
||||
file_id = cells[0].get_text(strip=True) # 第1列
|
||||
download_url = f'https://esarad.esa.int/?id={file_id}&handler=DownloadDb'
|
||||
|
||||
@@ -57,9 +62,12 @@ def scrape():
|
||||
filename = f'{file_id}.pdf'
|
||||
|
||||
file_path = os.path.join(folder_path, filename)
|
||||
if not os.path.exists(file_path):
|
||||
with open(file_path, 'wb') as f:
|
||||
f.write(file_response.content)
|
||||
print(f'ESA Downloaded: {file_path}')
|
||||
else:
|
||||
print(f'file exist: {file_path}')
|
||||
|
||||
# 创建 CSV 文件
|
||||
csv_file_path = os.path.join(folder_path, 'data.csv')
|
||||
@@ -204,7 +212,7 @@ def scrape():
|
||||
|
||||
upload_ids = upload_to_minio(esa_connection, folder_path, 'ESA')
|
||||
origin_data = ','.join([c.get_text(strip=True) for c in cells])
|
||||
data_db = ['ESA-' + cells[0].get_text(strip=True), 'Crawler', datetime.now(), None, None, None,
|
||||
data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
|
||||
cells[5].get_text(strip=True),
|
||||
cells[1].get_text(strip=True), cells[1].get_text(strip=True),
|
||||
cells[8].get_text(strip=True),
|
||||
@@ -215,6 +223,8 @@ def scrape():
|
||||
save_to_db_import_record(esa_connection, data_db)
|
||||
else:
|
||||
print(f'Error: {response.status_code}')
|
||||
except Exception as e:
|
||||
print(e)
|
||||
finally:
|
||||
# 关闭游标和连接
|
||||
esa_connection.close()
|
||||
|
||||
@@ -76,7 +76,9 @@ def scrape():
|
||||
print("total:" + str(json_data['RECORDS']))
|
||||
for index, row in enumerate(json_data['ROWS']):
|
||||
print("index:"+str(index))
|
||||
part_number = row[0] # 部件编号
|
||||
table_id= get_md5(''.join([row[0], row[1], row[2]]))
|
||||
part_number =table_id # row[0] # 部件编号
|
||||
|
||||
file_links_str = row[4] # 文件链接
|
||||
|
||||
# 使用正则表达式分隔文件名
|
||||
@@ -229,8 +231,7 @@ def scrape():
|
||||
manufacturer=row[2]
|
||||
experiment_date=row[3]
|
||||
origin_data = ','.join(row)
|
||||
|
||||
data_db = [get_md5(''.join([device_name,device_function,manufacturer])), 'Crawler', datetime.now(), None, None, None,
|
||||
data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
|
||||
device_type,
|
||||
device_name, device_mode,
|
||||
device_function,
|
||||
|
||||
@@ -6,7 +6,7 @@ import mysql.connector
|
||||
import requests
|
||||
from lxml import html
|
||||
|
||||
from common import db_config, upload_to_minio, save_to_db_import_record
|
||||
from common import db_config, upload_to_minio, save_to_db_import_record, get_md5
|
||||
|
||||
# 定义页面 URL
|
||||
url = 'https://radhome.gsfc.nasa.gov/radhome/papers/TIDPart.html'
|
||||
@@ -55,7 +55,8 @@ def scrape():
|
||||
columns = row.xpath('.//td')
|
||||
if len(columns) < 8:
|
||||
continue # 跳过不完整的行
|
||||
part_number = columns[2].text_content().strip() # 第三列 Part Number
|
||||
table_id= get_md5('NASA-' + columns[0].text_content().strip())
|
||||
part_number =table_id # columns[2].text_content().strip() # 第三列 Part Number
|
||||
file_link_tag = columns[7].xpath('//a') # 第八列的 <a> 标签
|
||||
|
||||
# 获取文件名和下载链接
|
||||
@@ -199,7 +200,7 @@ def scrape():
|
||||
manufacturer = None
|
||||
experiment_date = columns[8].text_content().strip()
|
||||
origin_data = ','.join([c.text_content().strip() for c in columns])
|
||||
data_db = ['NASA-' + columns[0].text_content().strip(), 'Crawler', datetime.now(), None, None, None,
|
||||
data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
|
||||
device_type,
|
||||
device_name, device_mode,
|
||||
device_function,
|
||||
|
||||
Reference in New Issue
Block a user