From 434c7459204e06cc3aebce834e1942e77e9484c8 Mon Sep 17 00:00:00 2001
From: oscarz
Date: Thu, 24 Apr 2025 17:24:13 +0800
Subject: [PATCH] modify scripts

---
 iafd/src/fetch.py        | 25 ++++++++--------
 iafd/src/iafd_scraper.py | 13 ++++++---
 javdb/src/fetch.py       | 61 ++++++++++++++++++++++++++++++----------
 javdb/src/scraper.py     | 16 +++++++----
 4 files changed, 78 insertions(+), 37 deletions(-)

diff --git a/iafd/src/fetch.py b/iafd/src/fetch.py
index 001bab6..c8a4249 100644
--- a/iafd/src/fetch.py
+++ b/iafd/src/fetch.py
@@ -247,7 +247,7 @@ def fetch_performers_detail_once(perfomers_list):
         logging.debug(f"Fetching data for performer ({person}), url {url} ...")
         soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
         # File was loaded from the local cache, skip it
-        if skip_local and status_code == 99 :
+        if skip_local and status_code == scraper.http_code_local :
             last_performer_id = curr_id
             continue
         if soup:
@@ -272,11 +272,11 @@ def fetch_performers_detail_once(perfomers_list):
                 })
             else:
                 logging.warning(f'parse_page_performer error. person: ({person}), url: {url}')
-        elif status_code and status_code == 404:
-            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=2)
+        elif status_code and status_code == scraper.http_code_404:
+            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=scraper.http_code_404)
             logging.warning(f'404 page. id: {performer_id}, name: {person}, url: {url}, Skiping...')
-        elif status_code and status_code == 601:
-            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=3)
+        elif status_code and status_code == scraper.http_code_url:
+            performer_id = db_tools.insert_or_update_performer_404(name=person, href=url, is_full_data=scraper.http_code_url)
             logging.warning(f'601 page(wrong url). id: {performer_id}, name: {person}, url: {url}, Skiping...')
         else:
             logging.warning(f'fetch_page error. person: ({person}), url: {url}')
@@ -293,7 +293,7 @@ def fetch_performers_detail():
     # Get the list of new performers
     while True:
         if force: # walk through all records from the beginning
-            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
+            perfomers_list = db_tools.query_performer_hrefs(start_id=last_perfomer_id, is_full_data_not_in=[scraper.http_code_404, scraper.http_code_url], order_by='id asc', limit=limit_count)
         else: # update only
             perfomers_list = db_tools.query_performer_hrefs(is_full_data=0, limit=limit_count)
         if len(perfomers_list) < 1:
@@ -322,7 +322,7 @@ def fetch_movies_detail():
     last_movie_id = 0
     while True:
         if force: # walk through all records from the beginning
-            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count)
+            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[scraper.http_code_404, scraper.http_code_url], order_by='id asc', limit=limit_count)
         else: # update only
             movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
         if len(movies_list) < 1:
@@ -336,8 +336,9 @@ def fetch_movies_detail():
             logging.debug(f"Fetching data for movie: {curr_id}: ({title}), url {url} ...")
             soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
             # File was loaded from the local cache, skip it
-            if skip_local and status_code == 99 :
+            if skip_local and status_code == scraper.http_code_local :
                 last_movie_id = curr_id
+                succ_count += 1
                 continue
             if soup:
                 movie_data = scraper.parse_page_movie(soup, url, title)
@@ -359,13 +360,13 @@ def fetch_movies_detail():
                     utils.write_movie_json(url, movie_data)
                 else:
                     logging.warning(f'parse_page_movie error. url: {url}')
-            elif status_code and status_code == 404:
+            elif status_code and status_code == scraper.http_code_404:
                 # mark as processed
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=2)
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
                 logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
-            elif status_code and status_code == 601:
+            elif status_code and status_code == scraper.http_code_url:
                 # mark as processed
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=3)
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_url)
                 logging.warning(f'601 page(wrong url). id: {movie_id}, title: ({title}), url: {url}, Skiping...')
             else:
                 logging.warning(f'fetch_page error. url: {url}')
diff --git a/iafd/src/iafd_scraper.py b/iafd/src/iafd_scraper.py
index ef7a778..1ba861b 100644
--- a/iafd/src/iafd_scraper.py
+++ b/iafd/src/iafd_scraper.py
@@ -36,6 +36,11 @@ headers = {
 }
 
 scraper = cloudscraper.create_scraper()
 
+http_code_404 = 404
+http_code_login = 401
+http_code_url = 601
+http_code_local = 99
+
 save_raw_html = True
 load_from_local = True
@@ -49,27 +54,27 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
         soup = BeautifulSoup(html_text, parser)
         if validator(soup):
             # run the custom page check
-            return soup, 99 # return a code below 100 to show the page came from the local cache
+            return soup, http_code_local # return a code below 100 to show the page came from the local cache
 
     for attempt in range(max_retries):
         try:
             if host_url not in url.lower():
                 logging.error(f'wrong url format: {url}')
-                return None, 601
+                return None, http_code_url
 
             response = scraper.get(url, headers=headers)
 
             # handle HTTP status codes
             if response.status_code == 404:
                 logging.debug(f"Page not found (404): {url}")
-                return None, 404 # return 404 directly so the caller can skip this page
+                return None, http_code_404 # return 404 directly so the caller can skip this page
 
             response.raise_for_status() # handle HTTP errors
 
             # expired page, handled the same way as a 404
             if "invalid or outdated page" in response.text.lower():
                 logging.debug(f"invalid or outdated page: {url}")
-                return None, 404 # return 404 directly so the caller can skip this page
+                return None, http_code_404 # return 404 directly so the caller can skip this page
 
             if save_raw_html:
                 utils.write_raw_html(url, response.text)
diff --git a/javdb/src/fetch.py b/javdb/src/fetch.py
index cf4824e..61bfb6b 100644
--- a/javdb/src/fetch.py
+++ b/javdb/src/fetch.py
@@ -14,7 +14,9 @@ config.setup_logging()
 
 debug = False
 force = False
-skip_local = True
+skip_local = False
+from_actor = False
+abnormal_only = False
 
 # Get the performer list
 def fetch_actor_list():
@@ -152,10 +154,20 @@ def fetch_performers_detail():
     limit_count = 5 if debug else 100
     perfomers_list = []
     last_perfomer_id = 0
+    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
     while True:
         # fetch a batch from the database each time instead of loading everything at once
         if force: # walk through all records from the beginning
-            perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1)
+            if from_actor:
+                if abnormal_only:
+                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_in =abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
+                else:
+                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
+            else:
+                if abnormal_only:
+                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_in =abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
+                else:
+                    perfomers_list = db_tools.query_actors(start_id=last_perfomer_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
         else: # update only
             perfomers_list = db_tools.query_actors(is_full_data=0, limit=limit_count)
         if len(perfomers_list) < 1:
@@ -182,13 +194,13 @@ def fetch_performers_detail():
                     alias = data.get('alias', [])
                     all_movies.extend(data.get('movies', []))
 
-            elif status_code and status_code == 404:
-                actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=2)
+            elif status_code and status_code == scraper.http_code_404:
+                actor_id = db_tools.insert_or_update_actor_404(name=person, href=url, is_full_data=scraper.http_code_404)
                 logging.warning(f'404 page. id: {actor_id}, name: ({person}), url: {url}, Skiping...')
                 need_insert = False
                 break
-            elif status_code and status_code == 401:
-                actor_id = db_tools.insert_or_update_movie_404(name=person, href=url, is_full_data=3)
+            elif status_code and status_code == scraper.http_code_login:
+                actor_id = db_tools.insert_or_update_movie_404(name=person, href=url, is_full_data=scraper.http_code_login)
                 logging.warning(f'401 page(need login). id: {actor_id}, name: ({person}), url: {url}, Skiping...')
                 need_insert = False
                 break
@@ -225,9 +237,19 @@ def fetch_movies_detail():
     limit_count = 10 if debug else 100
     movies_list = []
     last_movie_id = 0
+    abnormal_codes = [scraper.http_code_404, scraper.http_code_login]
     while True:
         if force: # walk through all records from the beginning
-            movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=[2,3], order_by='id asc', limit=limit_count, from_actor_list=1)
+            if from_actor:
+                if abnormal_only:
+                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_in =abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
+                else:
+                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=1)
+            else:
+                if abnormal_only:
+                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_in =abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
+                else:
+                    movies_list = db_tools.query_movie_hrefs(start_id=last_movie_id, is_full_data_not_in=abnormal_codes, order_by='id asc', limit=limit_count, from_actor_list=0)
         else: # update only
             movies_list = db_tools.query_movie_hrefs(is_full_data=0, limit=limit_count)
         if len(movies_list) < 1:
@@ -241,8 +263,9 @@ def fetch_movies_detail():
             logging.debug(f"Fetching data for movie ({title}), url {url} ...")
             soup, status_code = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="video-meta-panel", attr_type="class"))
             # File was loaded from the local cache, skip it
-            if skip_local and status_code == 99 :
+            if skip_local and status_code == scraper.http_code_local :
                 last_movie_id = curr_id
+                succ_count += 1
                 continue
             # Parse the page and write it to the database
             if soup:
@@ -258,11 +281,11 @@ def fetch_movies_detail():
                 else:
                     logging.warning(f'parse_page_movie error. url: {url}')
-            elif status_code and status_code == 404:
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=2)
+            elif status_code and status_code == scraper.http_code_404:
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_404)
                 logging.warning(f'404 page. id: {movie_id}, title: ({title}), url: {url}, Skiping...')
-            elif status_code and status_code == 401:
-                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=3)
+            elif status_code and status_code == scraper.http_code_login:
+                movie_id = db_tools.insert_or_update_movie_404(title=title, href=url, is_full_data=scraper.http_code_login)
                 logging.warning(f'401 page(need login). id: {movie_id}, title: ({title}), url: {url}, Skiping...')
             else:
                 logging.warning(f'fetch_page error. url: {url}')
 
@@ -285,12 +308,12 @@ function_map = {
 }
 
 # Main function
-def main(cmd, args_debug, args_force, args_skip_local):
+def main(cmd, args_debug, args_force, args_skip_local, args_from_actor, args_abnormal_only):
     global debug
     debug = args_debug
     if debug:
         logger = logging.getLogger()
-        #logger.setLevel(logging.DEBUG)
+        logger.setLevel(logging.DEBUG)
 
     global force
     force = args_force
@@ -298,6 +321,12 @@
     global skip_local
     skip_local = args_skip_local
 
+    global from_actor
+    from_actor = args_from_actor
+
+    global abnormal_only
+    abnormal_only = args_abnormal_only
+
     # Start the task
     task_id = db_tools.insert_task_log()
     if task_id is None:
@@ -339,6 +368,8 @@ if __name__ == "__main__":
     parser.add_argument('--debug', action='store_true', help='Enable debug mode (limit records)')
     parser.add_argument('--force', action='store_true', help='force update (true for rewrite all)')
     parser.add_argument('--skip_local', action='store_true', help='skip if cached html (true for skip)')
+    parser.add_argument('--from_actor', action='store_true', help='only iterate performers/movies that come from actor_list (effective in force mode)')
+    parser.add_argument('--abnormal_only', action='store_true', help='only iterate performers/movies with an abnormal URL (404, login required, etc.; effective in force mode)')
 
     args = parser.parse_args()
-    main(args.cmd, args.debug, args.force, args.skip_local)
+    main(args.cmd, args.debug, args.force, args.skip_local, args.from_actor, args.abnormal_only)
diff --git a/javdb/src/scraper.py b/javdb/src/scraper.py
index 84867fb..44c137f 100644
--- a/javdb/src/scraper.py
+++ b/javdb/src/scraper.py
@@ -25,6 +25,10 @@ headers = {
 }
 
 scraper = cloudscraper.create_scraper()
 
+http_code_404 = 404
+http_code_login = 401
+http_code_local = 99
+
 save_raw_html = True
 load_from_local = True
@@ -38,8 +42,8 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
         soup = BeautifulSoup(html_text, parser)
         if validator(soup):
             # run the custom page check
-            logging.info(f"read from local. href: {url}")
-            return soup, 99 # return a code below 100 to show the page came from the local cache
+            logging.debug(f"read from local. href: {url}")
+            return soup, http_code_local # return a code below 100 to show the page came from the local cache
 
     for attempt in range(max_retries):
         try:
@@ -51,8 +55,8 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
 
             # handle HTTP status codes
             if response.status_code == 404:
-                logging.warning(f"Page not found (404): {url}")
-                return None, 404 # return 404 directly so the caller can skip this page
+                logging.debug(f"Page not found (404): {url}")
+                return None, http_code_404 # return 404 directly so the caller can skip this page
 
             response.raise_for_status() # handle HTTP errors
 
@@ -62,8 +66,8 @@ def fetch_page(url, validator, max_retries=3, parser="html.parser", preprocessor
             soup = BeautifulSoup(response.text, parser)
             # check whether the page redirected to the login page
             if soup.find('nav', class_='panel form-panel'):
-                logging.warning(f"Page redirected to login page on {url}.")
-                return None, 401
+                logging.debug(f"Page redirected to login page on {url}.")
+                return None, http_code_login
 
             if save_raw_html:
                 utils.write_raw_html(url, response.text)