import logging
from functools import partial

import config
import sqlite_utils as utils
import iafd_scraper as scraper
import utils as func

config.setup_logging()

# When True, each stage stops after its first item so a full run can be smoke-tested quickly.
debug = True


# Fetch performers by zodiac sign; these list pages are not paginated.
def fetch_performers_by_astro(existed_performer_hrefs):
    performers = []
    for astro in scraper.astro_list:
        url = scraper.astr_base_url + astro
        logging.info(f"Fetching data for {astro}, url {url} ...")
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_astro(soup, astro)
            if list_data:
                for row in list_data:
                    if row['href'] not in existed_performer_hrefs:
                        performers.append({'person': row['person'], 'href': row['href']})
            else:
                logging.warning(f'parse astro page error. {url} ...')
        else:
            logging.warning(f'fetch astro page error. {url} ...')
        # Break early while debugging.
        if debug:
            break
    return performers


# Fetch performers by birthday; these list pages are not paginated.
def fetch_performers_by_birth(existed_performer_hrefs):
    performers = []
    for month in range(1, 13):    # months 1..12
        for day in range(1, 32):  # days 1..31
            url = scraper.birth_base_url.format(month=month, day=day)
            logging.info(f"Fetching data for birth, url {url}")
            soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_page_birth(soup, month, day)
                if list_data:
                    for row in list_data:
                        if row['href'] not in existed_performer_hrefs:
                            performers.append({'person': row['person'], 'href': row['href']})
                else:
                    logging.warning(f'parse birth page error. {url} ...')
            else:
                logging.warning(f'fetch birth page error. {url} ...')
            # Return early while debugging.
            if debug:
                return performers
    return performers


# URL-encode ethnicity names that contain spaces.
def format_ethnic(ethnic):
    return ethnic.replace(' ', '+')


# Fetch performers by ethnicity; these list pages are paginated.
def fetch_performers_by_ethnic(existed_performer_hrefs):
    performers = []
    for ethnic in scraper.ethnic_list:
        next_url = scraper.ethnic_url + format_ethnic(ethnic)
        while next_url:
            url = next_url  # follow the pagination chain page by page
            logging.info(f"Fetching data for {ethnic}, url {url} ...")
            soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"), parser="lxml", preprocessor=scraper.preprocess_html)
            if soup:
                list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
                if list_data:
                    for row in list_data:
                        if row['href'] not in existed_performer_hrefs:
                            performers.append({'person': row['person'], 'href': row['href']})
                else:
                    logging.warning(f'parse ethnic page error. {url} ...')
            else:
                logging.warning(f'fetch ethnic page error. {url} ...')
                break  # stop paging this ethnicity on fetch failure
            # Return early while debugging.
            if debug:
                return performers
    return performers
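
# The three fetchers above share one loop shape: fetch a page, parse it into
# (list_data, next_url), keep rows whose href is not already in the db, and
# follow next_url when the listing is paginated. A minimal sketch of that
# shared loop as a generator follows; `parse_fn` is a hypothetical stand-in
# for parse_page_astro / parse_page_birth / parse_page_ethnic (bind their
# extra arguments with functools.partial), and the (list_data, next_url)
# return shape is assumed from how those parsers are used above.
def iter_list_pages(start_url, validator, parse_fn, **fetch_kwargs):
    """Yield parsed rows from a (possibly paginated) chain of list pages."""
    next_url = start_url
    while next_url:
        soup = scraper.fetch_page(next_url, validator, **fetch_kwargs)
        if not soup:
            logging.warning(f'fetch list page error. {next_url} ...')
            return
        list_data, next_url = parse_fn(soup)
        for row in (list_data or []):
            yield row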

# Fetch the distributors list.
def fetch_distributors_list(existed_distributors_href):
    url = scraper.distributors_list_url
    distributors_list = []
    logging.info(f"Fetching data for distributors list, url {url} ...")
    soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
    if soup:
        list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Distrib")
        if list_data:
            for row in list_data:
                dis_url = scraper.distributors_base_url + row['href']
                if dis_url in existed_distributors_href:
                    continue
                distributors_list.append({'name': row['name'], 'href': dis_url})
        else:
            logging.warning(f'parse distributors list error. {url} ...')
    else:
        logging.warning(f'fetch distributors list error. {url} ...')
    return distributors_list


# Fetch the studios list.
def fetch_studios_list(existed_studios_href):
    url = scraper.studios_list_url
    studios_list = []
    logging.info(f"Fetching data for studios list, url {url} ...")
    soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
    if soup:
        list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Studio")
        if list_data:
            for row in list_data:
                stu_url = scraper.studios_base_url + row['href']
                if stu_url in existed_studios_href:
                    continue
                studios_list.append({'name': row['name'], 'href': stu_url})
        else:
            logging.warning(f'parse studios list error. {url} ...')
    else:
        logging.warning(f'fetch studios list error. {url} ...')
    return studios_list
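
# fetch_distributors_list and fetch_studios_list differ only in the <select>
# name passed to the validator/parser and the base URL prepended to each href.
# A minimal sketch of a shared helper under those assumptions (the field names
# and the parse_page_dist_stu_list return shape are taken from the two
# functions above):
def fetch_name_href_list(list_url, select_name, base_url, existed_hrefs):
    results = []
    logging.info(f"Fetching data for {select_name} list, url {list_url} ...")
    soup = scraper.fetch_page(list_url, partial(scraper.generic_validator, tag="select", identifier=select_name, attr_type="name"))
    if not soup:
        logging.warning(f'fetch {select_name} list error. {list_url} ...')
        return results
    list_data, _ = scraper.parse_page_dist_stu_list(soup, select_name)
    for row in (list_data or []):
        full_url = base_url + row['href']
        if full_url not in existed_hrefs:
            results.append({'name': row['name'], 'href': full_url})
    return results
# Example: fetch_name_href_list(scraper.distributors_list_url, "Distrib",
#                               scraper.distributors_base_url, existed_hrefs)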

# Check for updates.
def check_update():
    # Load the performer hrefs already stored in the database.
    existed_performer_hrefs = utils.query_performer_hrefs()
    if not existed_performer_hrefs:
        logging.warning('get existed performers from db error.')
        return None

    # Collect new performers from the list pages.
    new_performers = []
    #new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
    #new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
    new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs))

    # Fetch each performer's page and write the result to the db.
    new_performers = list({item["href"]: item for item in new_performers}.values())  # dedupe by href
    logging.info(f'get new performers count: {len(new_performers)}')
    for performer in new_performers:
        url = performer['href']
        person = performer['person']
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
        if soup:
            data, credits = scraper.parse_page_performer(soup)
            if data:
                performer_id = utils.insert_or_update_performer({'href': url, 'person': person, **data})
                if performer_id:
                    logging.info(f'insert one person, id: {performer_id}, person: {person}, url: {url}')
                else:
                    logging.warning(f'insert person: {person} {url} failed.')
                # Also write a local JSON copy.
                func.write_person_json(person, url, {'href': url, 'person': person, **data, 'credits': credits if credits else {}})
            else:
                logging.warning(f'parse_page_performer error. person: {person}, url: {url}')
        else:
            logging.warning(f'fetch_page error. person: {person}, url: {url}')
        # Break early while debugging.
        if debug:
            break

    # Load the distributor hrefs from the database and insert any new distributors.
    existed_distributors_href = utils.query_distributor_hrefs()
    if existed_distributors_href is None:
        logging.warning('get existed distributors from db error.')
        return
    new_distributors = fetch_distributors_list(existed_distributors_href)
    for dist in new_distributors:
        dist_id = utils.insert_or_update_distributor(dist)
        if dist_id:
            logging.info(f"insert one distributor record, id: {dist_id}, name: {dist['name']}, href: {dist['href']}")
        else:
            logging.warning(f"insert into distributor failed. name: {dist['name']}, href: {dist['href']}")

    # Load the studio hrefs from the database and insert any new studios.
    existed_studios_href = utils.query_studio_hrefs()
    if existed_studios_href is None:
        logging.warning('get existed studios from db error.')
        return
    new_studios = fetch_studios_list(existed_studios_href)
    for stu in new_studios:
        stu_id = utils.insert_or_update_studio(stu)
        if stu_id:
            logging.info(f"insert one studio record, id: {stu_id}, name: {stu['name']}, href: {stu['href']}")
        else:
            logging.warning(f"insert into studio failed. name: {stu['name']}, href: {stu['href']}")

    # Load the movie hrefs from the database.
    existed_movies = utils.query_movie_hrefs()
    if existed_movies is None:
        logging.warning('load movies from db error')
        return

    new_movies = []
    new_movie_hrefs = []

    # Walk the distributors and collect their movie lists.
    existed_distributors_href = utils.query_distributor_hrefs(name='vixen')
    if existed_distributors_href is None:
        logging.warning('get existed distributors from db error.')
        return
    for url in existed_distributors_href:
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
            if list_data:
                for movie in list_data:
                    if movie['href'] in existed_movies:
                        continue
                    new_movies.append({'title': movie['title'], 'href': movie['href']})
                    new_movie_hrefs.append(movie['href'])
            else:
                logging.warning(f'parse_page_dist_stu error. url: {url}')
        # Break early while debugging.
        if debug:
            break
    logging.info(f'all new movies found for distributors, now total new {len(new_movies)}')

    # Walk the studios and collect their movie lists.
    existed_studios_href = utils.query_studio_hrefs(name='vixen')
    if existed_studios_href is None:
        logging.warning('get existed studios from db error.')
        return
    for url in existed_studios_href:
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
            if list_data:
                for movie in list_data:
                    # Skip movies already in the db or already collected from the distributor pages.
                    if movie['href'] in existed_movies or movie['href'] in new_movie_hrefs:
                        continue
                    new_movies.append({'title': movie['title'], 'href': movie['href']})
                    new_movie_hrefs.append(movie['href'])
            else:
                logging.warning(f'parse_page_dist_stu error. url: {url}')
        # Break early while debugging.
        if debug:
            break
    logging.info(f'all new movies found for studios, now total new {len(new_movies)}')

    # Fetch each new movie's page.
    new_movies = list({item["href"]: item for item in new_movies}.values())  # dedupe by href
    logging.info(f'get merged new movies, count: {len(new_movies)}')
    for movie in new_movies:
        url = movie['href']
        title = movie['title']
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
        if soup:
            movie_data = scraper.parse_page_movie(soup, url, title)
            if movie_data:
                movie_id = utils.insert_or_update_movie(movie_data)
                if movie_id:
                    logging.info(f'insert one movie, id: {movie_id}, title: {title}, url: {url}')
                else:
                    logging.warning(f'insert movie {url} failed.')
                # Also write a local JSON copy.
                func.write_movie_json(url, movie_data)
            else:
                logging.warning(f'parse_page_movie error. url: {url}')
        else:
            logging.warning(f'fetch_page error. url: {url}')
        # Break early while debugging.
        if debug:
            break

    logging.info('all process completed!')


if __name__ == "__main__":
    check_update()
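
# A minimal CLI sketch (an assumption, not part of the original flow) for
# driving the module-level `debug` switch from the command line instead of
# editing the source:
#
#   import argparse
#
#   parser = argparse.ArgumentParser(description="incremental IAFD update")
#   parser.add_argument("--debug", action="store_true",
#                       help="stop each stage after its first item")
#   args = parser.parse_args()
#   debug = args.debug  # at module level; a function would need `global debug`
#   check_update()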