# stock/scripts/iafd/src/fetch.py

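"""Incremental IAFD fetcher: collects performer, distributor, studio, and movie
records from the site's list pages, upserts them into the local database via
sqlite_utils, and mirrors each record to a local JSON file.
"""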
import logging
from functools import partial
import config
import sqlite_utils as utils
import iafd_scraper as scraper
import utils as func
config.setup_logging()
debug = True  # when True, stop after the first item of each loop for quick test runs
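# Assumption: scraper.generic_validator(page, tag=..., identifier=..., attr_type=...)
# checks that the parsed page contains the expected element, letting fetch_page
# treat a structurally wrong response (e.g. an error page) as a failed fetch.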
# Fetch the performer list by astrological sign; these pages have no pagination.
def fetch_performers_by_astro(existed_performer_hrefs):
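    """Return performers from the per-sign list pages whose hrefs are not yet in the db."""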
    performers = []
    for astro in scraper.astro_list:
        url = scraper.astr_base_url + astro
        logging.info(f"Fetching data for {astro}, url {url} ...")
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="astro", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_astro(soup, astro)
            if list_data:
                for row in list_data:
                    if row['href'] not in existed_performer_hrefs:
                        performers.append({
                            'person': row['person'],
                            'href': row['href']
                        })
            else:
                logging.warning(f'parse astro page error. {url} ...')
        else:
            logging.warning(f'fetch astro page error. {url} ...')
        # break early when debugging
        if debug:
            break
    return performers
# Fetch the performer list by birth date; these pages have no pagination.
def fetch_performers_by_birth(existed_performer_hrefs):
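    """Return performers from the per-birthday list pages whose hrefs are not yet in the db."""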
    performers = []
    for month in range(1, 13):      # months 1-12
        for day in range(1, 32):    # days 1-31
            url = scraper.birth_base_url.format(month=month, day=day)
            logging.info(f"Fetching data for birth, url {url}")
            soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-sm-12 col-lg-9", attr_type="class"))
            if soup:
                list_data, next_url = scraper.parse_page_birth(soup, month, day)
                if list_data:
                    for row in list_data:
                        if row['href'] not in existed_performer_hrefs:
                            performers.append({
                                'person': row['person'],
                                'href': row['href']
                            })
                else:
                    logging.warning(f'parse birth page error. {url} ...')
            else:
                logging.warning(f'fetch birth page error. {url} ...')
            # return early when debugging
            if debug:
                return performers
    return performers
# Encode ethnicity names that contain spaces for use in query URLs.
def format_ethnic(ethnic):
    return ethnic.replace(' ', '+')
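# A minimal standard-library alternative, assuming only spaces need escaping
# in these names (quote_plus also percent-encodes other reserved characters,
# which may or may not match what the site expects):
#
#   from urllib.parse import quote_plus
#   url = scraper.ethnic_url + quote_plus(ethnic)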
# Fetch the performer list by ethnicity; these pages are paginated.
def fetch_performers_by_ethnic(existed_performer_hrefs):
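    """Return performers from the paginated per-ethnicity list pages whose hrefs are not yet in the db."""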
    performers = []
    for ethnic in scraper.ethnic_list:
        next_url = scraper.ethnic_url + format_ethnic(ethnic)
        while next_url:
            # advance to the next page; previously this refetched the first page forever
            url = next_url
            logging.info(f"Fetching data for {ethnic}, url {url} ...")
            soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="row headshotrow", attr_type="class"),
                                      parser="lxml", preprocessor=scraper.preprocess_html)
            if soup:
                list_data, next_url = scraper.parse_page_ethnic(soup, ethnic)
                if list_data:
                    for row in list_data:
                        if row['href'] not in existed_performer_hrefs:
                            performers.append({
                                'person': row['person'],
                                'href': row['href']
                            })
                else:
                    logging.warning(f'parse ethnic page error. {url} ...')
            else:
                # stop paging this ethnicity after a failed fetch to avoid looping forever
                next_url = None
                logging.warning(f'fetch ethnic page error. {url} ...')
            # return early when debugging
            if debug:
                return performers
    return performers
# Fetch the list of distributors.
def fetch_distributors_list(existed_distributors_href):
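    """Return distributors from the sitewide distributor <select> list that are not yet in the db."""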
    url = scraper.distributors_list_url
    distributors_list = []
    logging.info(f"Fetching data for distributors list, url {url} ...")
    soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Distrib", attr_type="name"))
    if soup:
        list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Distrib")
        if list_data:
            for row in list_data:
                dis_url = scraper.distributors_base_url + row['href']
                if dis_url in existed_distributors_href:
                    continue
                distributors_list.append({
                    'name': row['name'],
                    'href': dis_url
                })
        else:
            logging.warning(f'parse distributors list error. {url} ...')
    else:
        logging.warning(f'fetch distributors list error. {url} ...')
    return distributors_list
# Fetch the list of studios.
def fetch_studios_list(existed_studios_href):
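    """Return studios from the sitewide studio <select> list that are not yet in the db."""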
    url = scraper.studios_list_url
    studios_list = []
    logging.info(f"Fetching data for studios list, url {url} ...")
    soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="select", identifier="Studio", attr_type="name"))
    if soup:
        list_data, next_url = scraper.parse_page_dist_stu_list(soup, "Studio")
        if list_data:
            for row in list_data:
                stu_url = scraper.studios_base_url + row['href']
                if stu_url in existed_studios_href:
                    continue
                studios_list.append({
                    'name': row['name'],
                    'href': stu_url
                })
        else:
            logging.warning(f'parse studios list error. {url} ...')
    else:
        logging.warning(f'fetch studios list error. {url} ...')
    return studios_list
# Check for updates.
def check_update():
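    """Diff the site's list pages against the database and ingest any new
    performers, distributors, studios, and movies.
    """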
    # Load the existing performer hrefs from the database
    existed_performer_hrefs = utils.query_performer_hrefs()
    if not existed_performer_hrefs:
        logging.warning('get existed performers from db error.')
        return None
    # Collect new performers from the list pages
    new_performers = []
    #new_performers.extend(fetch_performers_by_astro(existed_performer_hrefs))
    #new_performers.extend(fetch_performers_by_birth(existed_performer_hrefs))
    new_performers.extend(fetch_performers_by_ethnic(existed_performer_hrefs))
    # Deduplicate by href (the dict keeps the last entry per href), then fetch
    # each performer's details and write them to the db
    new_performers = list({item["href"]: item for item in new_performers}.values())
    logging.info(f'get new performers count: {len(new_performers)}')
    for performer in new_performers:
        url = performer['href']
        person = performer['person']
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="headshot", attr_type="id"))
        if soup:
            data, credits = scraper.parse_page_performer(soup)
            if data:
                performer_id = utils.insert_or_update_performer({
                    'href': url,
                    'person': person,
                    **data
                })
                if performer_id:
                    logging.info(f'insert one person, id: {performer_id}, person: {person}, url: {url}')
                else:
                    logging.warning(f'insert person: {person} {url} failed.')
                # Mirror the full record, including credits, to a local JSON file
                func.write_person_json(person, url, {
                    'href': url,
                    'person': person,
                    **data,
                    'credits': credits if credits else {}
                })
            else:
                logging.warning(f'parse_page_performer error. person: {person}, url: {url}')
        else:
            logging.warning(f'fetch_page error. person: {person}, url: {url}')
        # break early when debugging
        if debug:
            break
    # Load the distributor hrefs from the database
    existed_distributors_href = utils.query_distributor_hrefs()
    if existed_distributors_href is None:
        logging.warning('get existed distributors from db error.')
        return
    new_distributors = fetch_distributors_list(existed_distributors_href)
    for dist in new_distributors:
        dist_id = utils.insert_or_update_distributor(dist)
        if dist_id:
            logging.info(f'insert one distributor record, id: {dist_id}, name: {dist["name"]}, href: {dist["href"]}')
        else:
            logging.warning(f'insert into distributor failed. name: {dist["name"]} href: {dist["href"]}')
    # Load the studio hrefs from the database
    existed_studios_href = utils.query_studio_hrefs()
    if existed_studios_href is None:
        logging.warning('get existed studios from db error.')
        return
    new_studios = fetch_studios_list(existed_studios_href)
    for stu in new_studios:
        stu_id = utils.insert_or_update_studio(stu)
        if stu_id:
            logging.info(f'insert one studio record, id: {stu_id}, name: {stu["name"]}, href: {stu["href"]}')
        else:
            logging.warning(f'insert into studio failed. name: {stu["name"]}, href: {stu["href"]}')
    # Load the existing movie hrefs from the database
    existed_movies = utils.query_movie_hrefs()
    if existed_movies is None:
        logging.warning('load movies from db error')
        return
    new_movies = []
    new_movie_hrefs = []
    # Walk every distributor page to collect its movie list
    # (note: currently restricted to distributors matching 'vixen')
    existed_distributors_href = utils.query_distributor_hrefs(name='vixen')
    if existed_distributors_href is None:
        logging.warning('get existed distributors from db error.')
        return
    for url in existed_distributors_href:
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="distable", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_dist_stu(soup, 'distable')
            if list_data:
                for movie in list_data:
                    if movie['href'] in existed_movies:
                        continue
                    new_movies.append({
                        'title': movie['title'],
                        'href': movie['href']
                    })
                    new_movie_hrefs.append(movie['href'])
            else:
                logging.warning(f'parse distributor movie list error. url: {url}')
        # break early when debugging
        if debug:
            break
    logging.info(f'all new movies found for distributors, now total new {len(new_movies)}')
    # Walk every studio page to collect its movie list
    # (note: currently restricted to studios matching 'vixen')
    existed_studios_href = utils.query_studio_hrefs(name='vixen')
    if existed_studios_href is None:
        logging.warning('get existed studios from db error.')
        return
    for url in existed_studios_href:
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="table", identifier="studio", attr_type="id"))
        if soup:
            list_data, next_url = scraper.parse_page_dist_stu(soup, 'studio')
            if list_data:
                for movie in list_data:
                    # skip movies already in the db or already collected from distributors
                    if movie['href'] in existed_movies or movie['href'] in new_movie_hrefs:
                        continue
                    new_movies.append({
                        'title': movie['title'],
                        'href': movie['href']
                    })
                    new_movie_hrefs.append(movie['href'])
            else:
                logging.warning(f'parse studio movie list error. url: {url}')
        # break early when debugging
        if debug:
            break
    logging.info(f'all new movies found for studios, now total new {len(new_movies)}')
    # Deduplicate the merged list by href, then fetch each new movie's detail page
    new_movies = list({item["href"]: item for item in new_movies}.values())
    logging.info(f'get merged new movies, count: {len(new_movies)}')
    for movie in new_movies:
        url = movie['href']
        title = movie['title']
        soup = scraper.fetch_page(url, partial(scraper.generic_validator, tag="div", identifier="col-xs-12 col-sm-3", attr_type="class"))
        if soup:
            movie_data = scraper.parse_page_movie(soup, url, title)
            if movie_data:
                movie_id = utils.insert_or_update_movie(movie_data)
                if movie_id:
                    logging.info(f'insert one movie, id: {movie_id}, title: {title} url: {url}')
                else:
                    logging.warning(f'insert movie {url} failed.')
                # Mirror the full record to a local JSON file
                func.write_movie_json(url, movie_data)
            else:
                logging.warning(f'parse_page_movie error. url: {url}')
        else:
            logging.warning(f'fetch_page error. url: {url}')
        # break early when debugging
        if debug:
            break
    logging.info('all processing completed!')
if __name__ == "__main__":
    check_update()