From 8cd0a67b641dc57ec7debccd3490466b153548e6 Mon Sep 17 00:00:00 2001
From: oscarz
Date: Tue, 1 Jul 2025 16:39:25 +0800
Subject: [PATCH] modify scripts

---
 iafd/src/fetch.py | 187 +++++++++++++++++++++++++++------------------
 1 file changed, 111 insertions(+), 76 deletions(-)

diff --git a/iafd/src/fetch.py b/iafd/src/fetch.py
index 1a027b0..bc31628 100644
--- a/iafd/src/fetch.py
+++ b/iafd/src/fetch.py
@@ -30,24 +30,33 @@ def fetch_performers_by_astro():
         url = scraper.astr_base_url + astro
         logging.info(f"Fetching data for {astro}, url {url} ...")
-        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_page_astro(soup, astro)
-            if list_data:
-                for row in list_data :
-                    # write the performer row to the index table
-                    perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_astro_list=1)
-                    if perfomer_id:
-                        logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
-                    else:
-                        logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
-        else:
-            logging.warning(f'fetch astro error. {url} ...')
+        while True:
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
+            if soup:
+                list_data, next_url = scraper.parse_page_astro(soup, astro)
+                if list_data:
+                    all_updated = True
+                    for row in list_data:
+                        # write the performer row to the index table
+                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_astro_list=1)
+                        if perfomer_id:
+                            logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
+                        else:
+                            logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
+                            all_updated = False
+                    # the page only counts as done once every row is written; then move to the next page
+                    if all_updated:
+                        break
+                else:
+                    logging.warning(f'fetch astro error. {url} ...')
+                    time.sleep(0.5)
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+                break
+            else:
+                logging.warning(f'fetch astro error. {url} ...')
+                time.sleep(3)
 
         # break added for debugging
         if debug:
@@ -60,23 +69,34 @@ def fetch_performers_by_birth():
     for day in range(1, 32): # iterate over days 1-31
         url = scraper.birth_base_url.format(month=month, day=day)
         logging.info(f"Fetching data for birth, url {url}")
-        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
-        if soup:
-            list_data, next_url = scraper.parse_page_birth(soup, month, day)
-            if list_data:
-                for row in list_data :
-                    # write the performer row to the index table
-                    perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_birth_list=1)
-                    if perfomer_id:
-                        logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
-                    else:
-                        logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
-        else:
-            logging.warning(f'fetch astro error. {url} ...')
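+
+        # NOTE: a partially written page is re-fetched and re-parsed from
+        # scratch; this assumes insert_performer_index can safely be called
+        # again for rows that already succeeded on an earlier pass.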
+        while True:
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
+            if soup:
+                list_data, next_url = scraper.parse_page_birth(soup, month, day)
+                if list_data:
+                    all_updated = True
+                    for row in list_data:
+                        # write the performer row to the index table
+                        perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_birth_list=1)
+                        if perfomer_id:
+                            logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
+                        else:
+                            logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
+                            all_updated = False
+                    # the page only counts as done once every row is written; then move to the next page
+                    if all_updated:
+                        break
+                else:
+                    logging.warning(f'fetch birth error. {url} ...')
+                    time.sleep(1)
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+                break
+            else:
+                logging.warning(f'fetch birth error. {url} ...')
+                time.sleep(3)
 
         # break added for debugging
         if debug:
@@ -119,16 +139,21 @@ def fetch_performers_by_ethnic():
 
             soup, status_code = scraper.fetch_page(next_url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"), parser="lxml", preprocessor=scraper.preprocess_html)
             if soup:
-                list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
+                list_data, next_page_url = scraper.parse_page_ethnic(soup, ethnic)
                 if list_data:
+                    all_updated = True
                     for row in list_data :
                         # write the performer row to the index table
                         perfomer_id = db_tools.insert_performer_index(name=row['person'], href=row.get('href', '').lower(), from_ethnic_list=1)
                         if perfomer_id:
                             count += 1
-                            logging.debug("'insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
+                            logging.debug(f"insert performer index to db. performer_id:{perfomer_id}, name: {row['person']}, href:{row['href']}")
                         else:
-                            logging.warning("'insert performer index failed. name: {row['person']}, href:{row['href']}")
+                            logging.warning(f"insert performer index failed. name: {row['person']}, href:{row['href']}")
+                            all_updated = False
+                    # the page only counts as done once every row is written; then move to the next page
+                    if all_updated:
+                        next_url = next_page_url
                 else:
                     logging.warning(f'fetch astro error. {next_url} ...')
             elif status_code and status_code == 404:
                 logging.warning(f'fetch page error. httpcode: {status_code}, url: {next_url}, Skiping...')
                 break
             else:
                 logging.warning(f'fetch astro error. {next_url} ...')
+            time.sleep(3)
 
             pages +=1
 
         # break added for debugging
@@ -195,22 +221,32 @@ def fetch_movies_by_dist():
     url_list = db_tools.query_distributor_hrefs(name='vixen.com')
     for url in url_list:
         logging.info(f"Fetching data for distributor url {url} ...")
-        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
-            if list_data:
-                for movie in list_data:
-                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_dist_list=1)
-                    if tmp_id:
-                        logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
-                    else:
-                        logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
-            else :
-                logging.warning(f'parse_page_movie error. url: {url}')
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
-        else:
-            logging.warning(f'fetching page error. {url}')
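+
+        # same retry pattern as the performer fetchers: keep re-fetching the
+        # distributor page until every movie row has been persisted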
+        while True:
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
+            if soup:
+                list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
+                if list_data:
+                    all_updated = True
+                    for movie in list_data:
+                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_dist_list=1)
+                        if tmp_id:
+                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
+                        else:
+                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
+                            all_updated = False
+                    # the page only counts as done once every row is written; then move to the next page
+                    if all_updated:
+                        break
+                else:
+                    logging.warning(f'parse_page_movie error. url: {url}')
+                    time.sleep(1)
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+                break
+            else:
+                logging.warning(f'fetching page error. {url}')
+                time.sleep(3)
 
         # break added for debugging
         if debug:
             break
@@ -225,22 +261,32 @@ def fetch_movies_by_stu():
     url_list = db_tools.query_studio_hrefs(name='vixen.com')
     for url in url_list:
         logging.info(f"Fetching data for studio url {url} ...")
-        soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
-        if soup:
-            list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
-            if list_data:
-                for movie in list_data:
-                    tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_stu_list=1)
-                    if tmp_id:
-                        logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
-                    else:
-                        logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
-            else :
-                logging.warning(f'parse_page_movie error. url: {url}')
-        elif status_code and status_code == 404:
-            logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skiping...')
-        else:
-            logging.warning(f'fetching page error. {url}')
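+
+        # same retry pattern again, validated against the 'studio' table
+        # instead of 'distable'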
+        while True:
+            soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
+            if soup:
+                list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
+                if list_data:
+                    all_updated = True
+                    for movie in list_data:
+                        tmp_id = db_tools.insert_movie_index(title=movie['title'], href=movie['href'], release_year=utils.to_number(movie['year']), from_stu_list=1)
+                        if tmp_id:
+                            logging.debug(f"insert one movie index to db. movie_id: {tmp_id}, title: {movie['title']}, href: {movie['href']}")
+                        else:
+                            logging.warning(f"insert movie index failed. title: {movie['title']}, href: {movie['href']}")
+                            all_updated = False
+                    # the page only counts as done once every row is written; then move to the next page
+                    if all_updated:
+                        break
+                else:
+                    logging.warning(f'parse_page_movie error. url: {url}')
+                    time.sleep(1)
+            elif status_code and status_code == 404:
+                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
+                break
+            else:
+                logging.warning(f'fetching page error. {url}')
+                time.sleep(3)
 
         # break added for debugging
        if debug:
            break
@@ -325,17 +371,6 @@ def fetch_performers_detail():
         if debug:
             break
 
-    # get the list of performers pending update; this reconciliation still has issues
-    while False:
-        perfomers_list = db_tools.get_performers_needed_update(limit=limit_count)
-        if len(perfomers_list) < 1:
-            logging.info(f'all existed performers updated. ')
-            break
-        last_perfomer_id = fetch_performers_detail_once(perfomers_list)
-        logging.info(f'insert {len(perfomers_list)} person. last performer id: {last_perfomer_id}')
-        if debug:
-            break
-
 # update movie details
 def fetch_movies_detail():
     limit_count = 10 if debug else 100
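
Reviewer note, not part of the diff: the four while-True blocks above are the same fetch/parse/insert retry loop repeated four times. If this file is touched again, the loop could be collapsed into one helper. A minimal sketch, assuming the fetch_page/parse/insert call shapes used above; the helper name fetch_list_with_retry and the retry_delay/fail_delay parameters are hypothetical:

    import logging
    import time

    def fetch_list_with_retry(url, fetch, parse, insert,
                              retry_delay=1, fail_delay=3):
        """Re-fetch url until every parsed row inserts; return next_url or None."""
        while True:
            soup, status_code = fetch(url)
            if soup:
                list_data, next_url = parse(soup)
                if list_data:
                    # attempt every row so one failure does not hide the rest
                    results = [bool(insert(row)) for row in list_data]
                    if all(results):
                        return next_url  # page fully persisted; caller may advance
                else:
                    logging.warning(f'parse error. {url} ...')
                    time.sleep(retry_delay)
            elif status_code == 404:
                logging.warning(f'fetch page error. httpcode: {status_code}, url: {url}, Skipping...')
                return None
            else:
                logging.warning(f'fetch page error. {url} ...')
                time.sleep(fail_delay)

Each caller would pass its own partial(scraper.fetch_page, ...) and db_tools insert function, which would also keep the log messages consistent across the four fetchers.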