scripts
This commit is contained in:
@@ -6,7 +6,7 @@ import mysql.connector
|
||||
import requests
|
||||
from lxml import html
|
||||
|
||||
from common import db_config, upload_to_minio, save_to_db_import_record
|
||||
from common import db_config, upload_to_minio, save_to_db_import_record, get_md5
|
||||
|
||||
# Define the page URL
|
||||
url = 'https://radhome.gsfc.nasa.gov/radhome/papers/TIDPart.html'
|
||||
@@ -55,7 +55,8 @@ def scrape():
|
||||
columns = row.xpath('.//td')
|
||||
if len(columns) < 8:
|
||||
continue # 跳过不完整的行
|
||||
part_number = columns[2].text_content().strip() # 第三列 Part Number
|
||||
table_id= get_md5('NASA-' + columns[0].text_content().strip())
|
||||
part_number =table_id # columns[2].text_content().strip() # 第三列 Part Number
|
||||
file_link_tag = columns[7].xpath('//a') # 第八列的 <a> 标签
|
||||
|
||||
# Get the file name and download link
|
||||
@@ -199,7 +200,7 @@ def scrape():
|
||||
manufacturer = None
|
||||
experiment_date = columns[8].text_content().strip()
|
||||
origin_data = ','.join([c.text_content().strip() for c in columns])
|
||||
data_db = ['NASA-' + columns[0].text_content().strip(), 'Crawler', datetime.now(), None, None, None,
|
||||
data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
|
||||
device_type,
|
||||
device_name, device_mode,
|
||||
device_function,
|
||||
|
||||
Reference in New Issue
Block a user