modify scripts

2025-07-19 23:59:47 +08:00
parent 6522970dcb
commit e6e7680542
3 changed files with 166 additions and 106 deletions

View File

@@ -84,13 +84,17 @@ class ClmDBHandler(SQLiteDBHandler):
         if item['item_type'] == comm.ITEM_TYPE_CLM_INDEX:
             self.insert_index(item)
         elif item['item_type'] == comm.ITEM_TYPE_CLM_KEYWORDS:
-            self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=False)
+            self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key='words', exists_do_nothing=False)
         else:
             logging.error(f"unknown item.")
         return item

     def insert_index(self, item):
+        if item['is_update']:  # update only
+            self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=False)
+            return
         row_id = self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=True)
         if row_id:
             lnk_data = {}
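Note for readers outside the codebase: insert_or_update_common itself is not part of this diff, so the routing above only makes sense given its flag semantics. A minimal sketch of what the helper is assumed to do, using SQLite's upsert syntax (everything below is illustrative, not the project's implementation):

    import sqlite3

    def insert_or_update_common(conn, item, tbl, uniq_key=None, exists_do_nothing=True):
        # Hypothetical stand-in for the project's helper.
        cols = [k for k in item if k != 'item_type']
        col_list = ', '.join(cols)
        qmarks = ', '.join('?' for _ in cols)
        if uniq_key is None:
            sql = f"INSERT INTO {tbl} ({col_list}) VALUES ({qmarks})"
        elif exists_do_nothing:
            # leave an existing row untouched; rowcount stays 0 on conflict
            sql = f"INSERT INTO {tbl} ({col_list}) VALUES ({qmarks}) ON CONFLICT({uniq_key}) DO NOTHING"
        else:
            # overwrite the existing row: the path taken when is_update is set
            updates = ', '.join(f"{c}=excluded.{c}" for c in cols if c != uniq_key)
            sql = f"INSERT INTO {tbl} ({col_list}) VALUES ({qmarks}) ON CONFLICT({uniq_key}) DO UPDATE SET {updates}"
        cur = conn.execute(sql, [item[c] for c in cols])
        return cur.lastrowid if cur.rowcount else None

Under these assumed semantics, the falsy return on conflict is what makes the `if row_id:` guard above skip the link-table bookkeeping for rows that already exist.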
@@ -106,6 +110,19 @@ class ClmDBHandler(SQLiteDBHandler):
         else:
             logging.warning(f"insert index error: {item}")

+    def get_empty_title(self):
+        try:
+            self.cursor.execute(f"SELECT id, href FROM {self.tbl_name_clm_index} WHERE title='' ")
+            return [dict(row) for row in self.cursor.fetchall()]
+        except sqlite3.Error as e:
+            logging.error(f"query href failed: {e}")
+            return None
+
+    def get_count_by_keywords_id(self, key_words_id):
+        self.cursor.execute(f"SELECT count(*) as cnt FROM {self.tbl_name_words_index} WHERE words_id = ?", (key_words_id,))
+        row = self.cursor.fetchone()
+        return row[0] if row else None
+
     # query the href list by filter conditions
     def get_key_words(self, **filters):
         try:
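A hedged usage sketch of the two helpers added above; it assumes the handler sets sqlite3.Row as the connection's row factory (dict(row) in get_empty_title() requires a mapping-like row), and the keyword id is made up:

    db = ClmDBHandler()
    rows = db.get_empty_title() or []       # index rows whose title is still ''
    print(f"{len(rows)} titles to repair")
    for row in rows:
        print(row['id'], row['href'])
    print(db.get_count_by_keywords_id(1))   # link rows recorded for keyword id 1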
@@ -128,6 +145,9 @@ class ClmDBHandler(SQLiteDBHandler):
             else:
                 params.append(filters[key])

+        if "query_str" in filters:
+            sql += f" AND {filters['query_str']} "
+
         if "order_by" in filters:
             # note: the column name must follow ORDER BY directly; a bound placeholder would be treated as a string literal
             sql += f" ORDER BY {filters['order_by']} "

View File

@@ -155,6 +155,7 @@ class ClmIndexItem(scrapy.Item):
     last_down_date = scrapy.Field()
     key_words_id = scrapy.Field()
     key_words = scrapy.Field()
+    is_update = scrapy.Field()

 class ClmKeywordsIndexItem(scrapy.Item):
     item_type = scrapy.Field()

View File

@@ -8,35 +8,6 @@ from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM
 from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler, ClmDBHandler

 db_clm = ClmDBHandler()
-db_comm = IAFDDBHandler()
-
-default_keywords = [
-    {'vixen group' : ['vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper']},
-    {'VIP 4K' : ['Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k']},
-    {'Teen Mega World' : ['anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels']},
-    {'Fuck You Cash' : ['BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd']},
-    {'Naughty America (Network)' : ['Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings']},
-    {'Nubiles Porn (Network)' : ['MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn']},
-    {'Brazzers' : ['Real Wife Stories', 'brazzers']},
-    {'TeamSkeet (Network)' : ['teenpies', 'shoplyfter']},
-    {'BangBros' : ['BangBus', 'BangBros']},
-    {'Nubile Films' : ['nfbusty', 'NubileFilms']},
-    {'DDF Network' : ['DDFBusty']},
-    {'Adult Time (Network)' : ['AdultTime', 'BurningAngel']},
-    {'Anal Vids' : ['AnalVids']},
-    {'LegalPorno' : ['LegalPorno']},
-    {'Pornbox' : ['Pornworld']},
-    {'Wow (Network)' : ['WowGirls']},
-    {'Malibu Media' : ['x-art']},
-    {'VIPissy Cash' : ['VIPissy']},
-    {'japan Blu-Ray' : ['Japan AV Blu-Ray']},
-    {'siterip' : ['siterip']},
-    {'Brazil' : ['NewMFX']},
-    {'Wicked' : ['Wicked']},
-    {'Sticky Dollars' : ['Swallowed']},
-    {'ManyVids' : ['ManyVids']},
-    {'PervCity' : ['AnalOverdose']}
-]

 class ClmSpider(BaseSpider):
     name = SPIDER_NAME_CLM
@@ -51,6 +22,7 @@ class ClmSpider(BaseSpider):
         self.keywords = keywords
         self.min_size = float(min_size) if min_size else 1.0
         self.run_task = True
+        self.fix_title = False

         # a magic keyword
         if keywords and keywords.lower() == 'reload' :
@@ -59,86 +31,38 @@ class ClmSpider(BaseSpider):
             self.run_task = False
             self.logger.info(f"reload keywords db succ!")

-    # import the specified keywords into the database
-    def initDB(self):
-        for row in default_keywords:
-            for group, items in row.items():
-                for item in items:
-                    words_item = ClmKeyWordsItem()
-                    words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
-                    words_item['words'] = item
-                    words_item['groups'] = group
-                    words_item['tags'] = ''
-                    words_item['index_count'] = 0
-                    db_clm.insert_item(words_item)
-                    self.logger.debug(f"insert item: {item}: {group}")
-
-    # load the actor list from other data sources into the database
-    def init_load_actors_from_others(self):
-        all_likes = {
-            'vixen' : ['vixen.com', 'Vixen Video'],
-            'tushy' : ['tushy.com', 'tushyraw.com', 'Tushy', 'Tushy Raw'],
-            'blacked' : ['blacked.com', 'Blacked', 'blackedraw.com', 'Blacked Raw'],
-            'x-art' : ['x-art.com', 'X-art'],
-            'nfbusty' : ['nfbusty.com']
-        }
-        # first convert the format
-        all_key_group = {}
-        all_keys = []
-        for group, keys in all_likes.items():
-            for key in keys:
-                all_key_group[key] = group
-                all_keys.append(key)
-
-        # query the database and transform the data
-        actor_tags = {}
-        total_lines = 0
-        results = db_comm.get_iafd_actors(names=all_keys, tbl='stu')
-        for dist, actors in results.items():
-            self.logger.info(f"dist: {dist}, actors count: {len(actors)}")
-            total_lines += len(actors)
-            for actor in actors :
-                #self.logger.debug(f"get {dist} : {actor['name']}, {actor['href']}")
-                actor_name = actor['name']
-                current_tag = all_key_group.get(dist, '')
-                if actor_name not in actor_tags:
-                    actor_tags[actor_name] = set()  # a set deduplicates automatically
-                if current_tag:
-                    actor_tags[actor_name].add(current_tag)  # set.add() ignores duplicates
-        self.logger.info(f"total actors in iafd: {len(actor_tags)}, total lines: {total_lines}")
-
-        # query another table for its results
-        load_results = db_comm.get_lord_actors()
-        if load_results:
-            self.logger.info(f"total actors in lord: {len(load_results)}")
-            for row in load_results:
-                actor_name = row['name']
-                if actor_name not in actor_tags:
-                    actor_tags[actor_name] = set()  # a set deduplicates automatically
-                actor_tags[actor_name].add('thelordofporn')  # set.add() ignores duplicates
-        self.logger.info(f"after merge, total actors: {len(actor_tags)}")
-
-        for actor, tags_set in actor_tags.items():
-            tag_str = ','.join(tags_set)  # a set is directly iterable
-            self.logger.info(f"actor: {actor}, tags: {tag_str}")
-            words_item = ClmKeyWordsItem()
-            words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
-            words_item['words'] = actor
-            words_item['groups'] = 'actress'
-            words_item['tags'] = tag_str
-            words_item['index_count'] = 0
-            db_clm.insert_item(words_item)
-            #self.logger.debug(f"insert item: {words_item}")
+        # another magic keyword
+        if keywords and keywords.lower() == 'fix' :
+            self.fix_title = True
+            self.run_task = False

     # entry point, triggered by the base class
     def custom_start_requests(self):
+        if self.fix_title:
+            data = db_clm.get_empty_title()
+            if data:
+                self.logger.info(f"rows to be fixed: {len(data)}")
+                for row in data:
+                    url = row['href']
+                    # re-request the detail page for each row
+                    yield scrapy.Request(
+                        url=url,
+                        callback=self.parse_page_detail,
+                        meta={'url': url},
+                        dont_filter=True  # allow duplicate requests so the URLs are not filtered out
+                    )
+            else:
+                self.logger.warning(f"no data.")

         if not self.run_task:
             return

+        tmp_query_str = f" groups='actress' and tags not like '%vixen%' "
         if self.debug:
-            keywords = db_clm.get_key_words(limit =5)
+            keywords = db_clm.get_key_words(limit =5, query_str = tmp_query_str)
         else:
-            keywords = db_clm.get_key_words(groups='actress', tags='vixen')
+            #keywords = db_clm.get_key_words(groups='actress', tags='vixen')
+            keywords = db_clm.get_key_words(query_str = tmp_query_str)

         for item in keywords:
             words_id = item['id']
@@ -197,7 +121,8 @@ class ClmSpider(BaseSpider):
             # the a tag under h3 (the title link)
             h3_a = ssbox.xpath('.//div[@class="title"]/h3/a')
             # title text (e.g. "Vixen.2025.05")
-            title_text = h3_a.xpath('text()').get().strip() if h3_a else None
+            #title_text = h3_a.xpath('text()').get().strip() if h3_a else None
+            title_text = extract_text_from_element(h3_a, use_title=True)
             # title link (e.g. "/hash/34c71bf8ddff9c797dab7ee1af83894fee13ac67.html")
             title_href = h3_a.xpath('@href').get() if h3_a else None
             # if the link is a relative path, it can be joined into a full URL (adjust to the site's domain)
@@ -240,6 +165,7 @@ class ClmSpider(BaseSpider):
             item['last_down_date'] = last_download
             item['key_words_id'] = response.meta.get('words_id', 0)
             item['key_words'] = response.meta.get('words', '')
+            item['is_update'] = False

             yield item
@@ -249,7 +175,7 @@ class ClmSpider(BaseSpider):
         # parse the next-page link
         pager = response.xpath('//div[@class="pager"]')
         if pager:
-            total_text = pager.xpath('.//span[contains(text(), "")]/text()').get() if sbar else ''
+            total_text = pager.xpath('.//span[contains(text(), "")]/text()').get()
             # locate the "下一页" (next page) a tag by its text, to avoid matching other a tags
             next_page_a = pager.xpath('.//a[contains(text(), "下一页")]').get()
@@ -270,5 +196,118 @@ class ClmSpider(BaseSpider):
                 )
             else:
                 # when href is "#" or missing, there are no more pages
-                self.logger.info(f'all pages fetched, paging stopped. {total_text}')
+                total_rows = db_clm.get_count_by_keywords_id(response.meta.get('words_id', 0))
+                curr_words = response.meta.get('words', '')
+                self.logger.info(f'all pages fetched, paging stopped. {total_text}, {total_rows} rows in total. key words: ({curr_words}), url: {response.url}')
+
+    def parse_page_detail(self, response):
+        # the h2 text under the div whose class is 'bt_title'
+        title_xpath = response.xpath('//div[@class="bt_title"]/h2/text()').get()
+
+        item = ClmIndexItem()
+        item['item_type'] = ITEM_TYPE_CLM_INDEX
+        item['title'] = title_xpath
+        item['href'] = response.meta['url']
+        item['is_update'] = True
+        yield item
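For context, a sketch of how the new 'fix' pass would be driven. The spider name ('clm', via SPIDER_NAME_CLM) and the standard Scrapy project layout are assumptions; `scrapy crawl clm -a keywords=fix` from the project root would be the equivalent CLI call:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    # keywords='fix' sets fix_title=True and run_task=False, so
    # custom_start_requests() only re-fetches the rows returned by
    # get_empty_title(); parse_page_detail() then yields items with
    # is_update=True, which insert_index() routes to its update-only branch.
    process.crawl('clm', keywords='fix')  # 'clm' assumed to match SPIDER_NAME_CLM
    process.start()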
+    # import the specified default keywords into the database
+    def initDB(self):
+        default_keywords = [
+            {'vixen group' : ['vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper']},
+            {'VIP 4K' : ['Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k']},
+            {'Teen Mega World' : ['anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels']},
+            {'Fuck You Cash' : ['BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd']},
+            {'Naughty America (Network)' : ['Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings']},
+            {'Nubiles Porn (Network)' : ['MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn']},
+            {'Brazzers' : ['Real Wife Stories', 'brazzers']},
+            {'TeamSkeet (Network)' : ['teenpies', 'shoplyfter']},
+            {'BangBros' : ['BangBus', 'BangBros']},
+            {'Nubile Films' : ['nfbusty', 'NubileFilms']},
+            {'DDF Network' : ['DDFBusty']},
+            {'Adult Time (Network)' : ['AdultTime', 'BurningAngel']},
+            {'Anal Vids' : ['AnalVids']},
+            {'LegalPorno' : ['LegalPorno']},
+            {'Pornbox' : ['Pornworld']},
+            {'Wow (Network)' : ['WowGirls']},
+            {'Malibu Media' : ['x-art']},
+            {'VIPissy Cash' : ['VIPissy']},
+            {'japan Blu-Ray' : ['Japan AV Blu-Ray']},
+            {'siterip' : ['siterip']},
+            {'Brazil' : ['NewMFX']},
+            {'Wicked' : ['Wicked']},
+            {'Sticky Dollars' : ['Swallowed']},
+            {'ManyVids' : ['ManyVids']},
+            {'PervCity' : ['AnalOverdose']}
+        ]
+        for row in default_keywords:
+            for group, items in row.items():
+                for item in items:
+                    words_item = ClmKeyWordsItem()
+                    words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
+                    words_item['words'] = item
+                    words_item['groups'] = group
+                    words_item['tags'] = ''
+                    words_item['index_count'] = 0
+                    db_clm.insert_item(words_item)
+                    self.logger.debug(f"insert item: {item}: {group}")
+
+    # load the actor list from other data sources into the database
+    def init_load_actors_from_others(self):
+        db_comm = IAFDDBHandler()
+        all_likes = {
+            'vixen' : ['vixen.com', 'Vixen Video'],
+            'tushy' : ['tushy.com', 'tushyraw.com', 'Tushy', 'Tushy Raw'],
+            'blacked' : ['blacked.com', 'Blacked', 'blackedraw.com', 'Blacked Raw'],
+            'x-art' : ['x-art.com', 'X-art'],
+            'nfbusty' : ['nfbusty.com']
+        }
+        # first convert the format
+        all_key_group = {}
+        all_keys = []
+        for group, keys in all_likes.items():
+            for key in keys:
+                all_key_group[key] = group
+                all_keys.append(key)
+
+        # query the database and transform the data
+        actor_tags = {}
+        total_lines = 0
+        results = db_comm.get_iafd_actors(names=all_keys, tbl='stu')
+        for dist, actors in results.items():
+            self.logger.info(f"dist: {dist}, actors count: {len(actors)}")
+            total_lines += len(actors)
+            for actor in actors :
+                #self.logger.debug(f"get {dist} : {actor['name']}, {actor['href']}")
+                actor_name = actor['name']
+                current_tag = all_key_group.get(dist, '')
+                if actor_name not in actor_tags:
+                    actor_tags[actor_name] = set()  # a set deduplicates automatically
+                if current_tag:
+                    actor_tags[actor_name].add(current_tag)  # set.add() ignores duplicates
+        self.logger.info(f"total actors in iafd: {len(actor_tags)}, total lines: {total_lines}")
+
+        # query another table for its results
+        load_results = db_comm.get_lord_actors()
+        if load_results:
+            self.logger.info(f"total actors in lord: {len(load_results)}")
+            for row in load_results:
+                actor_name = row['name']
+                if actor_name not in actor_tags:
+                    actor_tags[actor_name] = set()  # a set deduplicates automatically
+                actor_tags[actor_name].add('thelordofporn')  # set.add() ignores duplicates
+        self.logger.info(f"after merge, total actors: {len(actor_tags)}")
+
+        for actor, tags_set in actor_tags.items():
+            tag_str = ','.join(tags_set)  # a set is directly iterable
+            self.logger.info(f"actor: {actor}, tags: {tag_str}")
+            words_item = ClmKeyWordsItem()
+            words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
+            words_item['words'] = actor
+            words_item['groups'] = 'actress'
+            words_item['tags'] = tag_str
+            words_item['index_count'] = 0
+            db_clm.insert_item(words_item)
+            #self.logger.debug(f"insert item: {words_item}")
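A small worked example of the tag merge performed above (pure Python, no database; the actor and distributor names are made up):

    actor_tags = {}
    # IAFD pass: the same actor appears under two distributor keys
    for dist, group in [('vixen.com', 'vixen'), ('tushy.com', 'tushy')]:
        actor_tags.setdefault('Jane Doe', set()).add(group)
    # lord pass: adds its own tag; the set silently drops duplicates
    actor_tags.setdefault('Jane Doe', set()).add('thelordofporn')
    print(','.join(sorted(actor_tags['Jane Doe'])))  # thelordofporn,tushy,vixen

sorted() is only there to make the printed string deterministic; the spider joins the set in arbitrary order, so the stored tag order may vary between runs.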