This commit is contained in:
ls
2024-12-03 00:02:44 +08:00
parent 30aaee523c
commit 215444ac4d
4 changed files with 203 additions and 180 deletions

View File

@@ -6,7 +6,7 @@ import mysql.connector
import requests
from lxml import html
from common import db_config, upload_to_minio, save_to_db_import_record
from common import db_config, upload_to_minio, save_to_db_import_record, get_md5
# 定义页面 URL
url = 'https://radhome.gsfc.nasa.gov/radhome/papers/TIDPart.html'
@@ -55,7 +55,8 @@ def scrape():
columns = row.xpath('.//td')
if len(columns) < 8:
continue # 跳过不完整的行
part_number = columns[2].text_content().strip() # 第三列 Part Number
table_id= get_md5('NASA-' + columns[0].text_content().strip())
part_number =table_id # columns[2].text_content().strip() # 第三列 Part Number
file_link_tag = columns[7].xpath('//a') # 第八列的 <a> 标签
# 获取文件名和下载链接
@@ -199,7 +200,7 @@ def scrape():
manufacturer = None
experiment_date = columns[8].text_content().strip()
origin_data = ','.join([c.text_content().strip() for c in columns])
data_db = ['NASA-' + columns[0].text_content().strip(), 'Crawler', datetime.now(), None, None, None,
data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
device_type,
device_name, device_mode,
device_function,