modify scripts
@@ -21,3 +21,4 @@ ITEM_TYPE_ACTOR_DETAIL = 'actor_detail'
 
 ITEM_TYPE_CLM_KEYWORDS = 'keywords'
 ITEM_TYPE_CLM_INDEX = 'index'
+ITEM_TYPE_CLM_WORDS_INDEX = 'words_index'
@@ -77,10 +77,11 @@ class ClmDBHandler(SQLiteDBHandler):
         super().__init__(db_path)
         self.tbl_name_clm_index = 'clm_index'
         self.tbl_name_clm_keywords = 'clm_keywords'
+        self.tbl_name_words_index = 'clm_keywords_index'
 
     def insert_item(self, item):
         if item['item_type'] == comm.ITEM_TYPE_CLM_INDEX:
-            self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=True)
+            self.insert_index(item)
         elif item['item_type'] == comm.ITEM_TYPE_CLM_KEYWORDS:
             self.insert_or_update_common(item, self.tbl_name_clm_keywords, uniq_key=None, exists_do_nothing=True)
         else:
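With this change every caller goes through insert_item() and the item_type field picks the target table; index items are routed to the new insert_index() shown in the next hunk. A minimal usage sketch, assuming the no-argument construction the spider below uses (db_path appears to have a default), with made-up field values:

import scrapy_proj.comm.comm_def as comm
from scrapy_proj.db_wapper.spider_db_handler import ClmDBHandler

db = ClmDBHandler()  # assumed: db_path defaults, as in the spider's own usage

# An index item carries the keyword context so insert_index() can also
# write the keyword-to-index link row; all values here are made up.
db.insert_item({
    'item_type': comm.ITEM_TYPE_CLM_INDEX,
    'href': '/example-href',
    'key_words_id': 3,
    'key_words': 'vixen',
})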
@@ -88,26 +89,56 @@ class ClmDBHandler(SQLiteDBHandler):
 
         return item
 
+    def insert_index(self, item):
+        row_id = self.insert_or_update_common(item, self.tbl_name_clm_index, uniq_key='href', exists_do_nothing=True)
+        if row_id:
+            lnk_data = {}
+            lnk_data['words_id'] = item['key_words_id']
+            lnk_data['index_id'] = row_id
+            lnk_data['wid_iid'] = f"{item['key_words_id']}_{row_id}"
+            lnk_data['tags'] = item['key_words']
+            lnk_id = self.insert_or_update_common(lnk_data, self.tbl_name_words_index, uniq_key='wid_iid', exists_do_nothing=True)
+            if lnk_id:
+                logging.debug(f"insert one item: {lnk_data}")
+            else:
+                logging.warning(f"insert item error: {lnk_data}")
+        else:
+            logging.warning(f"insert index error: {item}")
+
-    def _create_tables(self):
-        # create the u001 data table
-        self.cursor.execute(f'''
-        CREATE TABLE clm_index (
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            category TEXT,
-            title TEXT,
-            href TEXT UNIQUE,
-            magnet_href TEXT,
-            size_text TEXT,
-            size_gb REAL,
-            heat INTEGER default 0,
-            add_date TEXT,
-            last_down_date TEXT,
-            created_at TEXT DEFAULT (datetime('now', 'localtime')),
-            updated_at TEXT DEFAULT (datetime('now', 'localtime'))
-        );
-        ''')
-        self.conn.commit()
-
+    # query the keyword list by the given filters
+    def get_key_words(self, **filters):
+        try:
+            sql = f"SELECT id, words, groups FROM {self.tbl_name_clm_keywords} WHERE 1=1"
+            params = []
+
+            conditions = {
+                "id": " AND id = ?",
+                "words": " AND words LIKE ?",
+                "groups": " AND groups LIKE ?",
+                "start_id": " AND id > ?",
+            }
+
+            for key, condition in conditions.items():
+                if key in filters:
+                    sql += condition
+                    if key == "words" or key == 'groups':
+                        params.append(f"%{filters[key]}%")
+                    else:
+                        params.append(filters[key])
+
+            if "order_by" in filters:
+                # note: the column name goes directly after ORDER BY; a placeholder cannot be used here, or it would be bound as a plain string
+                sql += f" ORDER BY {filters['order_by']} "
+
+            if 'limit' in filters:
+                sql += " LIMIT ?"
+                params.append(filters["limit"])
+
+            self.cursor.execute(sql, params)
+            return [dict(row) for row in self.cursor.fetchall()]
+        except sqlite3.Error as e:
+            logging.error(f"get_key_words query failed: {e}")
+            return None
 
 
 @register_handler(comm.SPIDER_NAME_IAFD)
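get_key_words() grows its WHERE clause from a `WHERE 1=1` base, appending one predicate per recognized filter and binding every value through `?` placeholders; only the ORDER BY column name is interpolated, which is why it cannot be a placeholder. A standalone sketch of the same pattern against an in-memory SQLite database (table name, columns, and data are made up):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.row_factory = sqlite3.Row
cur = conn.cursor()
cur.execute("CREATE TABLE kw (id INTEGER PRIMARY KEY, words TEXT)")
cur.execute("INSERT INTO kw (words) VALUES ('vixen')")

def query(**filters):
    sql = "SELECT id, words FROM kw WHERE 1=1"  # base any AND-clause can extend
    params = []
    if 'words' in filters:
        sql += " AND words LIKE ?"
        params.append(f"%{filters['words']}%")  # LIKE values get wrapped in %
    if 'limit' in filters:
        sql += " LIMIT ?"
        params.append(filters['limit'])
    cur.execute(sql, params)
    return [dict(row) for row in cur.fetchall()]

print(query(words='vix', limit=5))  # [{'id': 1, 'words': 'vixen'}]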
@@ -138,6 +138,9 @@ class PBoxMovItem(scrapy.Item):
 class ClmKeyWordsItem(scrapy.Item):
     item_type = scrapy.Field()
     words = scrapy.Field()
+    groups = scrapy.Field()
+    tags = scrapy.Field()
+    index_count = scrapy.Field()
 
 class ClmIndexItem(scrapy.Item):
     item_type = scrapy.Field()
@@ -150,3 +153,12 @@ class ClmIndexItem(scrapy.Item):
     heat = scrapy.Field()
     add_date = scrapy.Field()
     last_down_date = scrapy.Field()
+    key_words_id = scrapy.Field()
+    key_words = scrapy.Field()
+
+class ClmKeywordsIndexItem(scrapy.Item):
+    item_type = scrapy.Field()
+    words_id = scrapy.Field()
+    index_id = scrapy.Field()
+    wid_iid = scrapy.Field()
+    tags = scrapy.Field()
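ClmKeywordsIndexItem mirrors the link rows that insert_index() writes (the handler currently fills a plain dict rather than this item class): wid_iid concatenates the keyword id and the index id, giving insert_or_update_common a unique key so re-crawled results don't duplicate the link. A sketch with made-up ids:

from scrapy_proj.items import ClmKeywordsIndexItem
from scrapy_proj.comm.comm_def import ITEM_TYPE_CLM_WORDS_INDEX

link = ClmKeywordsIndexItem()
link['item_type'] = ITEM_TYPE_CLM_WORDS_INDEX
link['words_id'] = 3    # made-up clm_keywords row id
link['index_id'] = 42   # made-up clm_index row id
link['wid_iid'] = f"{link['words_id']}_{link['index_id']}"  # composite unique key
link['tags'] = 'vixen'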
@@ -5,36 +5,37 @@ from scrapy_proj.utils.utils import parse_size, parse_date_to_datetime
 from scrapy_proj.spiders.base_spider import BaseSpider, extract_text_from_element
 from scrapy_proj.items import ClmIndexItem, ClmKeyWordsItem
 from scrapy_proj.comm.comm_def import SPIDER_NAME_CLM, ITEM_TYPE_CLM_INDEX, ITEM_TYPE_CLM_KEYWORDS
-from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler
+from scrapy_proj.db_wapper.spider_db_handler import IAFDDBHandler, ClmDBHandler
 
-db_tools = IAFDDBHandler()
+db_clm = ClmDBHandler()
+db_comm = IAFDDBHandler()
 
 default_keywords = [
-    'vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper',  # vixen group
-    'Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k',  # VIP 4K
-    'anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels',  # Teen Mega World
-    'BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd',  # Fuck You Cash
-    'Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings',  # Naughty America (Network)
-    'MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn',  # Nubiles Porn (Network)
-    'Real Wife Stories', 'brazzers',  # Brazzers
-    'teenpies', 'shoplyfter',  # TeamSkeet (Network)
-    'BangBus', 'BangBros',  # BangBros
-    'nfbusty', 'NubileFilms',  # Nubile Films
-    'DDFBusty',  # DDF Network
-    'AdultTime', 'BurningAngel',  # Adult Time (Network)
-    'AnalVids',  # Anal Vids
-    'LegalPorno',
-    'Pornworld',  # Pornbox
-    'WowGirls',  # Wow (Network)
-    'x-art',  # Malibu Media
-    'VIPissy',  # VIPissy Cash
-    'Japan AV Blu-Ray',  # japan
-    'siterip',  # siterip
-    'NewMFX',  # Brazil
-    'Wicked',  # Wicked
-    'Swallowed',  # Sticky Dollars
-    'ManyVids',  # ManyVids
-    'AnalOverdose',  # PervCity
+    {'vixen group': ['vixen', 'tushy', 'tushyraw', 'blacked', 'blackedraw', 'deeper']},
+    {'VIP 4K': ['Cuck4K', 'Daddy4k', 'Loan4k', 'Dyke4K', 'Rim4k', 'Pie4k', 'Ignore4K', 'Daddy4k', 'Stuck4k', 'Tutor4k']},
+    {'Teen Mega World': ['anal-angels', 'Anal-Beauty', 'Beauty4k', 'creampie-angels', 'Beauty-Angels', 'FirstBGG', 'FuckStudies', 'OhMyHoles', 'X-Angels']},
+    {'Fuck You Cash': ['BBCPie', 'Tiny4k', 'Cum4K', 'Anal4K', 'Exotic4K', 'Facials4k', 'Holed', 'Lubed', 'Mom4K', 'passion hd']},
+    {'Naughty America (Network)': ['Naughty Office', 'Naughty Americans', 'Naughty America', 'Naughty Weddings']},
+    {'Nubiles Porn (Network)': ['MyFamilyPies', 'StepSiblingsCaught', 'nubilesporn']},
+    {'Brazzers': ['Real Wife Stories', 'brazzers']},
+    {'TeamSkeet (Network)': ['teenpies', 'shoplyfter']},
+    {'BangBros': ['BangBus', 'BangBros']},
+    {'Nubile Films': ['nfbusty', 'NubileFilms']},
+    {'DDF Network': ['DDFBusty']},
+    {'Adult Time (Network)': ['AdultTime', 'BurningAngel']},
+    {'Anal Vids': ['AnalVids']},
+    {'LegalPorno': ['LegalPorno']},
+    {'Pornbox': ['Pornworld']},
+    {'Wow (Network)': ['WowGirls']},
+    {'Malibu Media': ['x-art']},
+    {'VIPissy Cash': ['VIPissy']},
+    {'japan Blu-Ray': ['Japan AV Blu-Ray']},
+    {'siterip': ['siterip']},
+    {'Brazil': ['NewMFX']},
+    {'Wicked': ['Wicked']},
+    {'Sticky Dollars': ['Swallowed']},
+    {'ManyVids': ['ManyVids']},
+    {'PervCity': ['AnalOverdose']}
 ]
 
 class ClmSpider(BaseSpider):
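Each default_keywords entry is now a single-key dict mapping a group label to its keywords, so the group name travels with the data instead of living in a trailing comment. The nested iteration that initDB() in the next hunk uses looks like this (list truncated to two entries):

sample = [
    {'vixen group': ['vixen', 'tushy']},
    {'Brazzers': ['Real Wife Stories', 'brazzers']},
]

for row in sample:
    for group, words in row.items():  # one (label, keywords) pair per dict
        for word in words:
            print(f"{group} -> {word}")
# vixen group -> vixen
# vixen group -> tushy
# Brazzers -> Real Wife Stories
# Brazzers -> brazzers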
@@ -50,27 +51,32 @@ class ClmSpider(BaseSpider):
         self.keywords = keywords
         self.min_size = float(min_size) if min_size else 1.0
 
+        #self.initDB()
+
+    def initDB(self):
+        for row in default_keywords:
+            for group, items in row.items():
+                for item in items:
+                    words_item = ClmKeyWordsItem()
+                    words_item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
+                    words_item['words'] = item
+                    words_item['groups'] = group
+                    words_item['tags'] = ''
+                    words_item['index_count'] = 0
+                    db_clm.insert_item(words_item)
+                    self.logger.debug(f"insert item: {item}: {group}")
+
     # entry point, triggered by a method of the base class
     def custom_start_requests(self):
-        list_words = self.keywords.split(',') if self.keywords else default_keywords
-
-        item = ClmKeyWordsItem()
-        item['item_type'] = ITEM_TYPE_CLM_KEYWORDS
-        item['words'] = self.keywords if self.keywords else 'default keywords'
-        yield item
-
         if self.debug:
-            actors = db_tools.get_lord_actors(limit = 5)
+            keywords = db_clm.get_key_words(limit=5)
         else:
-            actors = db_tools.get_lord_actors()
+            keywords = db_clm.get_key_words()
-        if actors:
-            for row in actors:
-                list_words.append(row['name'])
-        else:
-            self.logger.warning(f"get_lord_actors error.")
 
-        for item in list_words:
-            encoded_keyword = quote_plus(item.strip())
+        for item in keywords:
+            words_id = item['id']
+            words = item['words']
+            encoded_keyword = quote_plus(words.strip())
 
             # build the POST form data
             form_data = {
@@ -85,7 +91,7 @@
                 formdata=form_data,
                 #headers=self._get_headers(),
                 # do not auto-follow the redirect; handle the 302 manually
-                meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
+                meta={'dont_redirect': True, 'handle_httpstatus_list': [302], 'words_id': words_id, 'words': words},
                 callback=self.handle_redirect
             )
 
@@ -106,7 +112,8 @@
         yield scrapy.Request(
             url=result_url,
             #headers=self._get_headers(),
-            callback=self.parse_page_common
+            callback=self.parse_page_common,
+            meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')}
         )
 
 
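The remaining hunks thread words_id/words through request meta so parse_page_common can stamp them onto every ClmIndexItem, across both the manual 302 hop and pagination. A condensed sketch of the relay, with stand-in callbacks and a plain dict standing in for the item:

import scrapy

def handle_redirect(self, response):
    # follow the 302 Location manually, re-attaching the keyword context
    location = response.headers.get('Location', b'').decode()
    yield scrapy.Request(
        url=response.urljoin(location),
        callback=self.parse_page_common,
        meta={'words_id': response.meta.get('words_id', 0),
              'words': response.meta.get('words', '')},
    )

def parse_page_common(self, response):
    item = {}  # stand-in for ClmIndexItem()
    item['key_words_id'] = response.meta.get('words_id', 0)
    item['key_words'] = response.meta.get('words', '')
    yield item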
@@ -164,6 +171,8 @@
         item['heat'] = int(heat)
         item['add_date'] = add_time
         item['last_down_date'] = last_download
+        item['key_words_id'] = response.meta.get('words_id', 0)
+        item['key_words'] = response.meta.get('words', '')
 
         yield item
 
@@ -189,6 +198,7 @@
             yield scrapy.Request(
                 url=next_page_url,
                 callback=self.parse_page_common,
+                meta={'words_id': response.meta.get('words_id', 0), 'words': response.meta.get('words', '')},
                 dont_filter=True  # allow duplicate requests (prevent the URL from being dropped by the dedup filter)
             )
         else: