From 19353a830c8f37b90087ab0e1fde918943265030 Mon Sep 17 00:00:00 2001
From: sophon
Date: Sat, 19 Jul 2025 10:16:07 +0800
Subject: [PATCH] modify scripts

---
 scrapy_proj/scrapy_proj/comm/comm_def.py      |   3 +-
 .../db_wapper/spider_db_handler.py            |  71 ++++++++----
 scrapy_proj/scrapy_proj/items.py              |  14 ++-
 scrapy_proj/scrapy_proj/spiders/clm_spider.py | 102 ++++++++++--------
 4 files changed, 122 insertions(+), 68 deletions(-)

diff --git a/scrapy_proj/scrapy_proj/comm/comm_def.py b/scrapy_proj/scrapy_proj/comm/comm_def.py
index bed2398..6532a71 100644
--- a/scrapy_proj/scrapy_proj/comm/comm_def.py
+++ b/scrapy_proj/scrapy_proj/comm/comm_def.py
@@ -20,4 +20,5 @@
 ITEM_TYPE_MOVIE_DETAIL = 'movie_detail'
 ITEM_TYPE_ACTOR_DETAIL = 'actor_detail'
 ITEM_TYPE_CLM_KEYWORDS = 'keywords'
-ITEM_TYPE_CLM_INDEX = 'index'
\ No newline at end of file
+ITEM_TYPE_CLM_INDEX = 'index'
+ITEM_TYPE_CLM_WORDS_INDEX = 'words_index'
\ No newline at end of file
diff --git a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
index a1eb47d..e89b274 100644
--- a/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
+++ b/scrapy_proj/scrapy_proj/db_wapper/spider_db_handler.py
@@ -77,10 +77,11 @@ class ClmDBHandler(SQLiteDBHandler):
         super().__init__(db_path)
         self.tbl_name_clm_index = 'clm_index'
         self.tbl_name_clm_keywords = 'clm_keywords'
+        self.tbl_name_words_index = 'clm_keywords_index'
 
     def insert_item(self, item):
         if item['item_type'] == comm.ITEM_TYPE_CLM_INDEX:
-            self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=True)
+            self.insert_index(item)
         elif item['item_type'] == comm.ITEM_TYPE_CLM_KEYWORDS:
             self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=True)
         else:
@@ -88,26 +89,56 @@ class ClmDBHandler(SQLiteDBHandler):
 
         return item
 
+    def insert_index(self, item):
+        row_id = self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=True)
+        if row_id:
+            lnk_data = {}
+            lnk_data['words_id'] = item['key_words_id']
+            lnk_data['index_id'] = row_id
+            lnk_data['wid_iid'] = f"{item['key_words_id']}_{row_id}"  # composite uniqueness key: <words_id>_<index_id>
+            lnk_data['tags'] = item['key_words']
+            lnk_id = self.insert_or_update_common(lnk_data, self.tbl_name_words_index, uniq_key='wid_iid', exists_do_nothing=True)
+            if lnk_id:
+                logging.debug(f"insert one item: {lnk_data}")
+            else:
+                logging.warning(f"insert item error: {lnk_data}")
+        else:
+            logging.warning(f"insert index error: {item}")
 
-    def _create_tables(self):
-        # 创建 u001 数据表
-        self.cursor.execute(f'''
-            CREATE TABLE clm_index (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                category TEXT,
-                title TEXT,
-                href TEXT UNIQUE,
-                magnet_href TEXT,
-                size_text TEXT,
-                size_gb REAL,
-                heat INTEGER default 0,
-                add_date TEXT,
-                last_down_date TEXT,
-                created_at TEXT DEFAULT (datetime('now', 'localtime')),
-                updated_at TEXT DEFAULT (datetime('now', 'localtime'))
-            );
-        ''')
-        self.conn.commit()
+    # Query keyword rows, optionally filtered by id, words, groups or start_id
+    def get_key_words(self, **filters):
+        try:
+            sql = f"SELECT id, words, groups FROM {self.tbl_name_clm_keywords} WHERE 1=1"
+            params = []
+
+            conditions = {
+                "id": " AND id = ?",
+                "words": " AND words LIKE ?",
+                "groups": " AND groups LIKE ?",
+                "start_id": " AND id > ?",
+            }
+
+            for key, condition in conditions.items():
+                if key in filters:
+                    sql += condition
+                    if key == "words" or key == 'groups':
+                        params.append(f"%{filters[key]}%")
+                    else:
+                        params.append(filters[key])
+
+            if "order_by" in filters:
+                # Note: the column name follows ORDER BY directly; it cannot be a placeholder, or it would be bound as a string literal
+                sql += f" ORDER BY {filters['order_by']} "
+
+            if 'limit' in filters:
+                sql += " LIMIT ?"
+                params.append(filters["limit"])
+
+            self.cursor.execute(sql, params)
+            return [dict(row) for row in self.cursor.fetchall()]
+        except sqlite3.Error as e:
+            logging.error(f"get_key_words query failed: {e}")
+            return None
 
 
 @register_handler(comm.SPIDER_NAME_IAFD)
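
Reviewer note (not part of the diff): a minimal sketch of how the new ClmDBHandler.get_key_words filters compose. Constructing ClmDBHandler() with no argument mirrors what clm_spider.py does; the filter values below are illustrative.

    from scrapy_proj.db_wapper.spider_db_handler import ClmDBHandler

    db = ClmDBHandler()
    # 'words' and 'groups' are matched with LIKE '%...%'; 'id' is exact, 'start_id' means id > value.
    rows = db.get_key_words(groups='vixen', order_by='id', limit=10)
    for row in rows or []:  # get_key_words returns None on sqlite3.Error
        print(row['id'], row['words'], row['groups'])

    # Incremental pass: only keywords added after the last id already crawled.
    new_rows = db.get_key_words(start_id=42)
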
diff --git a/scrapy_proj/scrapy_proj/items.py b/scrapy_proj/scrapy_proj/items.py
index d722366..73ac33a 100644
--- a/scrapy_proj/scrapy_proj/items.py
+++ b/scrapy_proj/scrapy_proj/items.py
@@ -138,6 +138,9 @@ class PBoxMovItem(scrapy.Item):
 class ClmKeyWordsItem(scrapy.Item):
     item_type = scrapy.Field()
     words = scrapy.Field()
+    groups = scrapy.Field()
+    tags = scrapy.Field()
+    index_count = scrapy.Field()
 
 class ClmIndexItem(scrapy.Item):
     item_type = scrapy.Field()
@@ -149,4 +152,13 @@ class ClmIndexItem(scrapy.Item):
     size_gb = scrapy.Field()
     heat = scrapy.Field()
     add_date = scrapy.Field()
-    last_down_date = scrapy.Field()
\ No newline at end of file
+    last_down_date = scrapy.Field()
+    key_words_id = scrapy.Field()
+    key_words = scrapy.Field()
+
+class ClmKeywordsIndexItem(scrapy.Item):
+    item_type = scrapy.Field()
+    words_id = scrapy.Field()
+    index_id = scrapy.Field()
+    wid_iid = scrapy.Field()
+    tags = scrapy.Field()
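
Reviewer note (not part of the diff): ClmKeywordsIndexItem mirrors the link rows that ClmDBHandler.insert_index currently builds from a plain dict. If the link were ever routed through the item pipeline instead, it could be filled along these lines (the values are illustrative):

    from scrapy_proj.items import ClmKeywordsIndexItem
    from scrapy_proj.comm.comm_def import ITEM_TYPE_CLM_WORDS_INDEX

    link = ClmKeywordsIndexItem()
    link['item_type'] = ITEM_TYPE_CLM_WORDS_INDEX
    link['words_id'] = 3        # clm_keywords.id of the search keyword
    link['index_id'] = 17       # clm_index.id returned by insert_or_update_common
    link['wid_iid'] = '3_17'    # f"{words_id}_{index_id}", the uniqueness key
    link['tags'] = 'vixen'      # keyword text carried on the index item
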
diff --git a/scrapy_proj/scrapy_proj/spiders/clm_spider.py b/scrapy_proj/scrapy_proj/spiders/clm_spider.py
index 6213a1f..fee4c74 100644
--- a/scrapy_proj/scrapy_proj/spiders/clm_spider.py
+++ b/scrapy_proj/scrapy_proj/spiders/clm_spider.py
@@ -5,36 +5,37 @@ from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
 from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
 from scrapy_proj.items import ClmIndexItem, ClmKeyWordsItem
 from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM_TYPE_CLM_KEYWORDS
-from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
+from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler, ClmDBHandler
 
-db_tools = IAFDDBHandler()
+db_clm = ClmDBHandler()
+db_comm = IAFDDBHandler()
 
 default_keywords = [
-    'vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper',  # vixen group
-    'Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k',  # VIP 4K
-    'anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels',  # Teen Mega World
-    'BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd',  # Fuck You Cash
-    'Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings',  # Naughty America (Network)
-    'MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn',  # Nubiles Porn (Network)
-    'Real Wife Stories', 'brazzers',  # Brazzers
-    'teenpies', 'shoplyfter',  # TeamSkeet (Network)
-    'BangBus', 'BangBros',  # BangBros
-    'nfbusty', 'NubileFilms',  # Nubile Films
-    'DDFBusty',  # DDF Network
-    'AdultTime', 'BurningAngel',  # Adult Time (Network)
-    'AnalVids',  # Anal Vids
-    'LegalPorno',
-    'Pornworld',  # Pornbox
-    'WowGirls',  # Wow (Network)
-    'x-art',  # Malibu Media
-    'VIPissy',  # VIPissy Cash
-    'Japan AV Blu-Ray',  # japan
-    'siterip',  # siterip
-    'NewMFX',  # Brazil
-    'Wicked',  # Wicked
-    'Swallowed',  # Sticky Dollars
-    'ManyVids',  # ManyVids
-    'AnalOverdose',  # PervCity
+    {'vixen group' : ['vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper']},
+    {'VIP 4K' : ['Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k']},
+    {'Teen Mega World' : ['anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels']},
+    {'Fuck You Cash' : ['BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd']},
+    {'Naughty America (Network)' : ['Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings']},
+    {'Nubiles Porn (Network)' : ['MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn']},
+    {'Brazzers' : ['Real Wife Stories', 'brazzers']},
+    {'TeamSkeet (Network)' : ['teenpies', 'shoplyfter']},
+    {'BangBros' : ['BangBus', 'BangBros']},
+    {'Nubile Films' : ['nfbusty', 'NubileFilms']},
+    {'DDF Network' : ['DDFBusty']},
+    {'Adult Time (Network)' : ['AdultTime', 'BurningAngel']},
+    {'Anal Vids' : ['AnalVids']},
+    {'LegalPorno' : ['LegalPorno']},
+    {'Pornbox' : ['Pornworld']},
+    {'Wow (Network)' : ['WowGirls']},
+    {'Malibu Media' : ['x-art']},
+    {'VIPissy Cash' : ['VIPissy']},
+    {'japan Blu-Ray' : ['Japan AV Blu-Ray']},
+    {'siterip' : ['siterip']},
+    {'Brazil' : ['NewMFX']},
+    {'Wicked' : ['Wicked']},
+    {'Sticky Dollars' : ['Swallowed']},
+    {'ManyVids' : ['ManyVids']},
+    {'PervCity' : ['AnalOverdose']}
 ]
 
 class ClmSpider(BaseSpider):
@@ -50,27 +51,32 @@ class ClmSpider(BaseSpider):
         self.keywords = keywords
         self.min_size = float(min_size) if min_size else 1.0
 
+        #self.initDB()  # one-off seeding of clm_keywords from default_keywords
+
+    def initDB(self):
+        for row in default_keywords:
+            for group, items in row.items():
+                for item in items:
+                    words_item = ClmKeyWordsItem()
+                    words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
+                    words_item['words'] = item
+                    words_item['groups'] = group
+                    words_item['tags'] = ''
+                    words_item['index_count'] = 0
+                    db_clm.insert_item(words_item)
+                    self.logger.debug(f"insert item: {item}: {group}")
+
     # 入口函数,由基类的方法触发
     def custom_start_requests(self):
-        list_words = self.keywords.split(',') if self.keywords else default_keywords
-
-        item = ClmKeyWordsItem()
-        item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
-        item['words'] = self.keywords if self.keywords else 'default keywords'
-        yield item
-
         if self.debug:
-            actors = db_tools.get_lord_actors(limit = 5)
+            keywords = db_clm.get_key_words(limit=5)
         else:
-            actors = db_tools.get_lord_actors()
-        if actors:
-            for row in actors:
-                list_words.append(row['name'])
-        else:
-            self.logger.warning(f"get_lord_actors error.")
-
-        for item in list_words:
-            encoded_keyword = quote_plus(item.strip())
+            keywords = db_clm.get_key_words()
+
+        for item in keywords:
+            words_id = item['id']
+            words = item['words']
+            encoded_keyword = quote_plus(words.strip())
 
             # 构造POST表单数据
             form_data = {
@@ -85,7 +91,7 @@ class ClmSpider(BaseSpider):
                 formdata=form_data,
                 #headers=self._get_headers(),
                 # 不自动跟随重定向,手动处理302
-                meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
+                meta={'dont_redirect': True, 'handle_httpstatus_list': [302], 'words_id': words_id, 'words': words},
                 callback=self.handle_redirect
             )
 
@@ -106,7 +112,8 @@ class ClmSpider(BaseSpider):
             yield scrapy.Request(
                 url=result_url,
                 #headers=self._get_headers(),
-                callback=self.parse_page_common
+                callback=self.parse_page_common,
+                meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
             )
 
 
@@ -164,6 +171,8 @@ class ClmSpider(BaseSpider):
             item['heat'] = int(heat)
             item['add_date'] = add_time
             item['last_down_date'] = last_download
+            item['key_words_id'] = response.meta.get('words_id', 0)
+            item['key_words'] = response.meta.get('words', '')
 
             yield item
 
@@ -189,6 +198,7 @@ class ClmSpider(BaseSpider):
             yield scrapy.Request(
                 url=next_page_url,
                 callback=self.parse_page_common,
+                meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')},
                 dont_filter=True  # 允许重复请求(防止因URL参数被过滤)
             )
         else:
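
Reviewer note (not part of the diff): insert_index writes to clm_keywords_index with uniq_key='wid_iid', but this patch does not show that table being created (and the old _create_tables for clm_index is removed). A schema along the lines below is assumed to exist; the column types and the db path are guesses modeled on the removed clm_index DDL.

    import sqlite3

    # Assumed, not part of this patch: the link table ClmDBHandler.insert_index expects.
    conn = sqlite3.connect('clm.db')  # hypothetical path
    conn.execute('''
        CREATE TABLE IF NOT EXISTS clm_keywords_index (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            words_id INTEGER,
            index_id INTEGER,
            wid_iid TEXT UNIQUE,
            tags TEXT,
            created_at TEXT DEFAULT (datetime('now', 'localtime'))
        );
    ''')
    conn.commit()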