ls committed on 2024-12-03 00:02:44 +08:00
parent 30aaee523c
commit 215444ac4d
4 changed files with 203 additions and 180 deletions

View File

@@ -7,13 +7,13 @@ import mysql.connector
 from minio import Minio
 from minio.error import S3Error
-minio_public_url = 'http://58.215.212.230:8005/oss/'
+minio_public_url = '/oss/'
 # MySQL connection configuration
 db_config = {
     # 'host': 'physical-mysql',
     # 'port': 3306,
-    'host': '192.168.50.100',
-    'port': 23306,
+    'host': '127.0.0.1',
+    'port': 3306,
+    # 'host': '192.168.50.100',
+    # 'port': 23306,
     'user': 'root',
     'password': '123456',
     'database': 'physical-boot'
@@ -21,8 +21,8 @@ db_config = {
 # MinIO configuration
 minio_client = Minio(
     # "physical-minio:9000",  # MinIO server address or IP
-    "192.168.50.100:29000",  # MinIO server address or IP
+    "127.0.0.1:9000",  # MinIO server address or IP
+    # "192.168.50.100:29000",  # MinIO server address or IP
     access_key="root",  # replace with your Access Key
     secret_key="12345678",  # replace with your Secret Key
     secure=False  # False because the server speaks plain HTTP
@@ -38,7 +38,18 @@ def get_md5(input_string):
     # return the MD5 value as a hex string
     return md5_obj.hexdigest()
+def fetch_db_import_record(connection, data):
+    """Return how many rows in nasa_data_record already carry this id."""
+    cursor = connection.cursor()
+    try:
+        select_query = """select count(1) from `nasa_data_record` where `id`= %s;"""
+        cursor.execute(select_query, data)
+        result = cursor.fetchone()
+        return result[0]
+    except Exception as e:
+        print(e)
+    finally:
+        cursor.close()
 def save_to_db_import_record(connection, data):
     cursor = connection.cursor()
     try:
@@ -81,10 +92,10 @@ def upload_to_minio(connection,folder_path,type):
             object_name = f"{type}/{folder_name}/{file_name}"
             try:
                 # upload the file to MinIO
-                url = minio_client.fput_object(bucket_name, object_name, file_path)
+                # url = minio_client.fput_object(bucket_name, object_name, file_path)
                 print(f"Uploaded: {file_path} -> {bucket_name}/{object_name}")
                 # file_id = get_md5(object_name)
-                file_ids.append({file_name: os.path.join(minio_public_url, bucket_name, url.object_name)})
+                file_ids.append({file_name: os.path.join(minio_public_url, bucket_name, object_name)})
                 # db_file = [file_id, file_name,
                 #            minio_public_url + bucket_name + '/' + object_name, 'admin', datetime.now()]
                 # save_to_db_oss_file(connection, db_file)
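
Together, get_md5 and the new fetch_db_import_record give every scraper an idempotent import path: hash a source-prefixed key into the nasa_data_record id, skip the row if that id already exists, otherwise save it. A minimal sketch of the pattern, assuming an open mysql.connector connection; the import_once helper and its arguments are illustrative only, not part of this commit (the ESA scraper below carries the same check, commented out):

import mysql.connector
from common import db_config, get_md5, fetch_db_import_record, save_to_db_import_record

def import_once(connection, source_prefix, raw_id, data_db):
    # Stable primary key, e.g. get_md5('ESA-' + the row's first cell).
    table_id = get_md5(source_prefix + raw_id)
    count = fetch_db_import_record(connection, (table_id,))
    if count and count > 0:  # count is None when the lookup itself failed
        return False  # already imported by an earlier run
    save_to_db_import_record(connection, data_db)  # data_db[0] must be table_id
    return True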

View File

@@ -7,10 +7,10 @@ import mysql.connector
 import requests
 from bs4 import BeautifulSoup
-from common import upload_to_minio, save_to_db_import_record, db_config
+from common import upload_to_minio, save_to_db_import_record, db_config, get_md5, fetch_db_import_record
 # set the download directory
-download_dir = 'downloaded_files'
+download_dir = 'downloaded_files/ESA'
 os.makedirs(download_dir, exist_ok=True)
 # fetch the page content
@@ -31,10 +31,15 @@ def scrape():
     tables = soup.find_all('table')
     # extract the contents of the first table
-    for row in tables[0].find_all('tr')[2:]:  # skip the header rows
+    tab_content = tables[0].find_all('tr')[2:]  # skip the header rows
+    tab_content.reverse()  # process the rows in reverse order
+    for row in tab_content:
         cells = row.find_all(['td', 'th'])
         if len(cells) >= 6:  # make sure there are at least six columns
-            folder_name = cells[1].get_text(strip=True)  # column 2
+            table_id = get_md5('ESA-' + cells[0].get_text(strip=True))
+            # count = fetch_db_import_record(esa_connection, (table_id,))
+            # if count > 0:
+            #     continue
+            folder_name = table_id  # was cells[1].get_text(strip=True), column 2
             file_id = cells[0].get_text(strip=True)  # column 1
             download_url = f'https://esarad.esa.int/?id={file_id}&handler=DownloadDb'
@@ -57,9 +62,12 @@ def scrape():
             filename = f'{file_id}.pdf'
             file_path = os.path.join(folder_path, filename)
+            if not os.path.exists(file_path):
                 with open(file_path, 'wb') as f:
                     f.write(file_response.content)
                 print(f'ESA Downloaded: {file_path}')
+            else:
+                print(f'file exists: {file_path}')
             # create the CSV file
             csv_file_path = os.path.join(folder_path, 'data.csv')
@@ -204,7 +212,7 @@ def scrape():
             upload_ids = upload_to_minio(esa_connection, folder_path, 'ESA')
             origin_data = ','.join([c.get_text(strip=True) for c in cells])
-            data_db = ['ESA-' + cells[0].get_text(strip=True), 'Crawler', datetime.now(), None, None, None,
+            data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
                        cells[5].get_text(strip=True),
                        cells[1].get_text(strip=True), cells[1].get_text(strip=True),
                        cells[8].get_text(strip=True),
@@ -215,6 +223,8 @@ def scrape():
             save_to_db_import_record(esa_connection, data_db)
         else:
             print(f'Error: {response.status_code}')
+    except Exception as e:
+        print(e)
     finally:
         # close the cursor and connection
         esa_connection.close()

View File

@@ -76,7 +76,9 @@ def scrape():
print("total:" + str(json_data['RECORDS']))
for index, row in enumerate(json_data['ROWS']):
print("index:"+str(index))
part_number = row[0] # 部件编号
table_id= get_md5(''.join([row[0], row[1], row[2]]))
part_number =table_id # row[0] # 部件编号
file_links_str = row[4] # 文件链接
# 使用正则表达式分隔文件名
@@ -229,8 +231,7 @@ def scrape():
         manufacturer = row[2]
         experiment_date = row[3]
         origin_data = ','.join(row)
-        data_db = [get_md5(''.join([device_name, device_function, manufacturer])), 'Crawler', datetime.now(), None, None, None,
+        data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
                    device_type,
                    device_name, device_mode,
                    device_function,

View File

@@ -6,7 +6,7 @@ import mysql.connector
 import requests
 from lxml import html
-from common import db_config, upload_to_minio, save_to_db_import_record
+from common import db_config, upload_to_minio, save_to_db_import_record, get_md5
 # define the page URL
 url = 'https://radhome.gsfc.nasa.gov/radhome/papers/TIDPart.html'
@@ -55,7 +55,8 @@ def scrape():
         columns = row.xpath('.//td')
         if len(columns) < 8:
             continue  # skip incomplete rows
-        part_number = columns[2].text_content().strip()  # column 3, Part Number
+        table_id = get_md5('NASA-' + columns[0].text_content().strip())
+        part_number = table_id  # was columns[2].text_content().strip(), column 3 Part Number
         file_link_tag = columns[7].xpath('//a')  # the <a> tags in column 8
         # get the file name and the download link
@@ -199,7 +200,7 @@ def scrape():
         manufacturer = None
         experiment_date = columns[8].text_content().strip()
         origin_data = ','.join([c.text_content().strip() for c in columns])
-        data_db = ['NASA-' + columns[0].text_content().strip(), 'Crawler', datetime.now(), None, None, None,
+        data_db = [table_id, 'Crawler', datetime.now(), None, None, None,
                    device_type,
                    device_name, device_mode,
                    device_function,