scripts
This commit is contained in:
@@ -7,13 +7,13 @@ import mysql.connector
|
|||||||
from minio import Minio
|
from minio import Minio
|
||||||
from minio.error import S3Error
|
from minio.error import S3Error
|
||||||
|
|
||||||
minio_public_url = 'http://58.215.212.230:8005/oss/'
|
minio_public_url = '/oss/'
|
||||||
# MySQL 连接配置
|
# MySQL 连接配置
|
||||||
db_config = {
|
db_config = {
|
||||||
# 'host': 'physical-mysql',
|
'host': '127.0.0.1',
|
||||||
# 'port': 3306,
|
'port': 3306,
|
||||||
'host': '192.168.50.100',
|
# 'host': '192.168.50.100',
|
||||||
'port': 23306,
|
# 'port': 23306,
|
||||||
'user': 'root',
|
'user': 'root',
|
||||||
'password': '123456',
|
'password': '123456',
|
||||||
'database': 'physical-boot'
|
'database': 'physical-boot'
|
||||||
@@ -21,8 +21,8 @@ db_config = {
|
|||||||
|
|
||||||
# minio 配置
|
# minio 配置
|
||||||
minio_client = Minio(
|
minio_client = Minio(
|
||||||
# "physical-minio:9000", # MinIO服务器地址或IP
|
"127.0.0.1:9000", # MinIO服务器地址或IP
|
||||||
"192.168.50.100:29000", # MinIO服务器地址或IP
|
# "192.168.50.100:29000", # MinIO服务器地址或IP
|
||||||
access_key="root", # 替换为你的Access Key
|
access_key="root", # 替换为你的Access Key
|
||||||
secret_key="12345678", # 替换为你的Secret Key
|
secret_key="12345678", # 替换为你的Secret Key
|
||||||
secure=False # 如果使用的是http则为False
|
secure=False # 如果使用的是http则为False
|
||||||
@@ -38,7 +38,18 @@ def get_md5(input_string):
|
|||||||
# 返回MD5值的十六进制字符串
|
# 返回MD5值的十六进制字符串
|
||||||
return md5_obj.hexdigest()
|
return md5_obj.hexdigest()
|
||||||
|
|
||||||
|
def fetch_db_import_record(connection, data):
    """Count the rows in `nasa_data_record` whose `id` matches.

    Args:
        connection: Open database connection; only ``cursor()`` is called
            on it here.
        data: Parameters for the single ``%s`` placeholder, normally a
            one-element tuple/list such as ``(record_id,)``. A bare scalar
            id is also accepted and wrapped into a one-element tuple.

    Returns:
        The ``count(1)`` value from the first result row, or ``None`` when
        the query fails (the error is printed, not raised).
    """
    # Tuples, lists and dicts are passed through untouched; anything else is
    # treated as a single id and wrapped so it binds to the one placeholder.
    params = data if isinstance(data, (tuple, list, dict)) else (data,)
    count_query = """select count(1) from `nasa_data_record` where `id`= %s;"""
    cursor = connection.cursor()
    try:
        cursor.execute(count_query, params)
        row = cursor.fetchone()
        return row[0]
    except Exception as e:
        # Best-effort lookup: report the problem and signal failure with an
        # explicit None so callers can tell "failed" apart from "0 rows".
        print(e)
        return None
    finally:
        # Always release the cursor, on success and on failure.
        cursor.close()
|
||||||
def save_to_db_import_record(connection,data):
|
def save_to_db_import_record(connection,data):
|
||||||
cursor = connection.cursor()
|
cursor = connection.cursor()
|
||||||
try:
|
try:
|
||||||
@@ -81,10 +92,10 @@ def upload_to_minio(connection,folder_path,type):
|
|||||||
object_name = f"{type}/{folder_name}/{file_name}"
|
object_name = f"{type}/{folder_name}/{file_name}"
|
||||||
try:
|
try:
|
||||||
# 上传文件到 MinIO
|
# 上传文件到 MinIO
|
||||||
url=minio_client.fput_object(bucket_name, object_name, file_path)
|
# url=minio_client.fput_object(bucket_name, object_name, file_path)
|
||||||
print(f"已上传: {file_path} -> {bucket_name}/{object_name}")
|
print(f"已上传: {file_path} -> {bucket_name}/{object_name}")
|
||||||
# file_id = get_md5(object_name)
|
# file_id = get_md5(object_name)
|
||||||
file_ids.append({file_name:os.path.join(minio_public_url,bucket_name,url.object_name)})
|
file_ids.append({file_name:os.path.join(minio_public_url,bucket_name,object_name)})
|
||||||
# db_file = [file_id, file_name,
|
# db_file = [file_id, file_name,
|
||||||
# minio_public_url + bucket_name + '/' + object_name, 'admin', datetime.now()]
|
# minio_public_url + bucket_name + '/' + object_name, 'admin', datetime.now()]
|
||||||
# save_to_db_oss_file(connection,db_file)
|
# save_to_db_oss_file(connection,db_file)
|
||||||
|
|||||||
@@ -7,10 +7,10 @@ import mysql.connector
|
|||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from common import upload_to_minio, save_to_db_import_record, db_config
|
from common import upload_to_minio, save_to_db_import_record, db_config, get_md5, fetch_db_import_record
|
||||||
|
|
||||||
# 设置下载目录
|
# 设置下载目录
|
||||||
download_dir = 'downloaded_files'
|
download_dir = 'downloaded_files/ESA'
|
||||||
os.makedirs(download_dir, exist_ok=True)
|
os.makedirs(download_dir, exist_ok=True)
|
||||||
|
|
||||||
# 获取网页内容
|
# 获取网页内容
|
||||||
@@ -31,10 +31,15 @@ def scrape():
|
|||||||
tables = soup.find_all('table')
|
tables = soup.find_all('table')
|
||||||
|
|
||||||
# 提取第一个表格的内容
|
# 提取第一个表格的内容
|
||||||
for row in tables[0].find_all('tr')[2:]: # 跳过标题行
|
tab_content = tables[0].find_all('tr')[2:]
|
||||||
|
tab_content.reverse()
|
||||||
|
for row in tab_content: # 跳过标题行
|
||||||
cells = row.find_all(['td', 'th'])
|
cells = row.find_all(['td', 'th'])
|
||||||
if len(cells) >= 6: # 确保有至少六列
|
table_id = get_md5('ESA-' + cells[0].get_text(strip=True))
|
||||||
folder_name = cells[1].get_text(strip=True) # 第2列
|
# count = fetch_db_import_record(esa_connection, (table_id,))
|
||||||
|
# if count > 0:
|
||||||
|
# continue
|
||||||
|
folder_name = table_id # cells[1].get_text(strip=True) # 第2列
|
||||||
file_id = cells[0].get_text(strip=True) # 第1列
|
file_id = cells[0].get_text(strip=True) # 第1列
|
||||||
download_url = f'https://esarad.esa.int/?id={file_id}&handler=DownloadDb'
|
download_url = f'https://esarad.esa.int/?id={file_id}&handler=DownloadDb'
|
||||||
|
|
||||||
@@ -57,9 +62,12 @@ def scrape():
|
|||||||
filename = f'{file_id}.pdf'
|
filename = f'{file_id}.pdf'
|
||||||
|
|
||||||
file_path = os.path.join(folder_path, filename)
|
file_path = os.path.join(folder_path, filename)
|
||||||
|
if not os.path.exists(file_path):
|
||||||
with open(file_path, 'wb') as f:
|
with open(file_path, 'wb') as f:
|
||||||
f.write(file_response.content)
|
f.write(file_response.content)
|
||||||
print(f'ESA Downloaded: {file_path}')
|
print(f'ESA Downloaded: {file_path}')
|
||||||
|
else:
|
||||||
|
print(f'file exist: {file_path}')
|
||||||
|
|
||||||
# 创建 CSV 文件
|
# 创建 CSV 文件
|
||||||
csv_file_path = os.path.join(folder_path, 'data.csv')
|
csv_file_path = os.path.join(folder_path, 'data.csv')
|
||||||
@@ -204,7 +212,7 @@ def scrape():
|
|||||||
|
|
||||||
upload_ids = upload_to_minio(esa_connection, folder_path, 'ESA')
|
upload_ids = upload_to_minio(esa_connection, folder_path, 'ESA')
|
||||||
origin_data = ','.join([c.get_text(strip=True) for c in cells])
|
origin_data = ','.join([c.get_text(strip=True) for c in cells])
|
||||||
data_db = ['ESA-' + cells[0].get_text(strip=True), 'Crawler', datetime.now(), None, None, None,
|
data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
|
||||||
cells[5].get_text(strip=True),
|
cells[5].get_text(strip=True),
|
||||||
cells[1].get_text(strip=True), cells[1].get_text(strip=True),
|
cells[1].get_text(strip=True), cells[1].get_text(strip=True),
|
||||||
cells[8].get_text(strip=True),
|
cells[8].get_text(strip=True),
|
||||||
@@ -215,6 +223,8 @@ def scrape():
|
|||||||
save_to_db_import_record(esa_connection, data_db)
|
save_to_db_import_record(esa_connection, data_db)
|
||||||
else:
|
else:
|
||||||
print(f'Error: {response.status_code}')
|
print(f'Error: {response.status_code}')
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
finally:
|
finally:
|
||||||
# 关闭游标和连接
|
# 关闭游标和连接
|
||||||
esa_connection.close()
|
esa_connection.close()
|
||||||
|
|||||||
@@ -76,7 +76,9 @@ def scrape():
|
|||||||
print("total:" + str(json_data['RECORDS']))
|
print("total:" + str(json_data['RECORDS']))
|
||||||
for index, row in enumerate(json_data['ROWS']):
|
for index, row in enumerate(json_data['ROWS']):
|
||||||
print("index:"+str(index))
|
print("index:"+str(index))
|
||||||
part_number = row[0] # 部件编号
|
table_id= get_md5(''.join([row[0], row[1], row[2]]))
|
||||||
|
part_number =table_id # row[0] # 部件编号
|
||||||
|
|
||||||
file_links_str = row[4] # 文件链接
|
file_links_str = row[4] # 文件链接
|
||||||
|
|
||||||
# 使用正则表达式分隔文件名
|
# 使用正则表达式分隔文件名
|
||||||
@@ -229,8 +231,7 @@ def scrape():
|
|||||||
manufacturer=row[2]
|
manufacturer=row[2]
|
||||||
experiment_date=row[3]
|
experiment_date=row[3]
|
||||||
origin_data = ','.join(row)
|
origin_data = ','.join(row)
|
||||||
|
data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
|
||||||
data_db = [get_md5(''.join([device_name,device_function,manufacturer])), 'Crawler', datetime.now(), None, None, None,
|
|
||||||
device_type,
|
device_type,
|
||||||
device_name, device_mode,
|
device_name, device_mode,
|
||||||
device_function,
|
device_function,
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import mysql.connector
|
|||||||
import requests
|
import requests
|
||||||
from lxml import html
|
from lxml import html
|
||||||
|
|
||||||
from common import db_config, upload_to_minio, save_to_db_import_record
|
from common import db_config, upload_to_minio, save_to_db_import_record, get_md5
|
||||||
|
|
||||||
# 定义页面 URL
|
# 定义页面 URL
|
||||||
url = 'https://radhome.gsfc.nasa.gov/radhome/papers/TIDPart.html'
|
url = 'https://radhome.gsfc.nasa.gov/radhome/papers/TIDPart.html'
|
||||||
@@ -55,7 +55,8 @@ def scrape():
|
|||||||
columns = row.xpath('.//td')
|
columns = row.xpath('.//td')
|
||||||
if len(columns) < 8:
|
if len(columns) < 8:
|
||||||
continue # 跳过不完整的行
|
continue # 跳过不完整的行
|
||||||
part_number = columns[2].text_content().strip() # 第三列 Part Number
|
table_id= get_md5('NASA-' + columns[0].text_content().strip())
|
||||||
|
part_number =table_id # columns[2].text_content().strip() # 第三列 Part Number
|
||||||
file_link_tag = columns[7].xpath('//a') # 第八列的 <a> 标签
|
file_link_tag = columns[7].xpath('//a') # 第八列的 <a> 标签
|
||||||
|
|
||||||
# 获取文件名和下载链接
|
# 获取文件名和下载链接
|
||||||
@@ -199,7 +200,7 @@ def scrape():
|
|||||||
manufacturer = None
|
manufacturer = None
|
||||||
experiment_date = columns[8].text_content().strip()
|
experiment_date = columns[8].text_content().strip()
|
||||||
origin_data = ','.join([c.text_content().strip() for c in columns])
|
origin_data = ','.join([c.text_content().strip() for c in columns])
|
||||||
data_db = ['NASA-' + columns[0].text_content().strip(), 'Crawler', datetime.now(), None, None, None,
|
data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
|
||||||
device_type,
|
device_type,
|
||||||
device_name, device_mode,
|
device_name, device_mode,
|
||||||
device_function,
|
device_function,
|
||||||
|
|||||||
Reference in New Issue
Block a user