# Source code for helper.urlDeployment

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
引数URLのサイトにアクセスして、タイトルと最終画像URLをスクレイピングする。
最終画像URLの数字を展開した、URLリストをファイルに保存して、irvineに渡す。
    http:/hoge/10.jpg
    ↓
    http:/hoge/1.jpg ～ http:/hoge/10.jpg
irvineが起動してダウンロードが開始されるので、ダウンロードが終わったらirvineを手動で終了する。
irvineが終了したらダウンロードファイルをチェックする。
失敗している時は、拡張子を変えて、ファイルに保存して、irvineに渡す。
成功している時は、リネームしてzipして削除する。

Todo:
    - docstringを整える
"""
import sys
import copy
import inspect
import datetime
from dataclasses import dataclass
from urllib.parse import urlparse  # URLパーサー
from urllib.parse import urlunparse
import helper.chromeDriver
import helper.webFile
import helper.webFileList

# local source
from const import *



# [docs]
@dataclass(frozen=True)
class UrlDeploymentValue:
    """Immutable value object holding the inputs and results of a URL expansion.

    Attributes:
        url: URL of the site being processed (or ``None``).
        selectors: List of selectors used while scraping (or ``None``).
        title: Title obtained for the site (or ``None``).
        image_urls: List of image URLs (or ``None``).
    """
    url: str
    selectors: list
    title: str
    image_urls: list

    def __init__(self, url, selectors, title, image_urls):
        """Validate the arguments and store all four fields (complete constructor).

        Every field is always assigned, using ``None`` for omitted values, so
        that attribute access, ``repr()`` and ``==`` never fail with
        ``AttributeError`` on a partially initialised instance.

        :param url: str URL of the site being processed
        :param selectors: list selectors used while scraping
        :param title: str title obtained for the site
        :param image_urls: list URLs of the images obtained
        :raises ValueError: if ``url`` or any entry of ``image_urls`` has no
            URL scheme
        """
        if url is not None and not self.is_url_only(url):
            raise ValueError(f"{self.__class__.__name__}.{inspect.stack()[1].function}"
                             f"不正:引数urlがURLではない[{url}]")
        if image_urls is not None:
            for image_url in image_urls:
                if not self.is_url_only(image_url):
                    raise ValueError(f"{self.__class__.__name__}.{inspect.stack()[1].function}"
                                     f"不正:引数last_image_urlがurlではない[{image_url}]")
        # The dataclass is frozen, so fields have to be written through
        # object.__setattr__ instead of normal assignment.
        object.__setattr__(self, "url", url)
        object.__setattr__(self, "selectors", selectors)
        object.__setattr__(self, "title", title)
        object.__setattr__(self, "image_urls", image_urls)

    @staticmethod
    def is_url_only(string: str) -> bool:
        """Return True when ``string`` parses with a non-empty URL scheme.

        :param string: str text to check
        :return: bool True if ``urlparse`` finds a scheme
        """
        return bool(urlparse(string).scheme)





# [docs]
class UrlDeployment:
    """Expand a numbered "last image" URL into the full list of image URLs.

    The constructor accepts one of three argument shapes:

    * a ``UrlDeploymentValue`` value object, which is stored as is;
    * ``(str page URL, list selectors)``: the page is scraped through
      ``helper.chromeDriver.ChromeDriver`` for a title and an image URL;
    * ``(str last image URL, str title)``: used when the URL path ends in
      ``.jpg`` or ``.png``.

    In both string cases the image URL ``.../N.ext`` is expanded into
    ``.../1.ext`` through ``.../N.ext``.
    """
    # Class-level defaults only. __init__ and __deployment bind a separate
    # list on each instance, so the shared empty list below is never mutated.
    value_object: "UrlDeploymentValue" = None
    url_list: list = []

    def __init__(self, value_object, selectors_or_title=None):
        """Build the value object from one of the supported argument shapes.

        Prints a message and exits the process with status 1 when a string
        argument has no URL scheme. Arguments that are neither ``None``, a
        ``UrlDeploymentValue`` nor a ``str`` are ignored and leave
        ``value_object`` unset.

        :param value_object: UrlDeploymentValue, or str page URL, or str last
            image URL
        :param selectors_or_title: list of selectors (page URL case) or str
            title (image URL case)
        :raises ValueError: if scraping yields no image URL
        """
        # Per-instance list: results must not leak between instances through
        # the class attribute.
        self.url_list = []
        if value_object is None:
            return
        if isinstance(value_object, UrlDeploymentValue):
            self.value_object = value_object
            return
        if not isinstance(value_object, str):
            return

        # Check that the string is a URL.
        parsed = urlparse(value_object)
        if not parsed.scheme:
            print('引数が不正です。URLではない？')
            sys.exit(1)
        if parsed.path.endswith(('.jpg', '.png')):
            # The argument is already the last image URL; the second argument
            # is the title.
            selectors = None
            title = selectors_or_title
            image_url = value_object
        else:
            # The argument is a page URL; the second argument is the selector
            # list used for scraping.
            selectors = selectors_or_title
            title, image_url = self._scrape(value_object, selectors)
        image_urls = self.__deployment(image_url)
        self.value_object = UrlDeploymentValue(value_object,
                                               selectors,
                                               title,
                                               image_urls,
                                               )

    @staticmethod
    def _scrape(page_url, selectors):
        """Scrape ``page_url`` for a title and an image URL and save the page source.

        :param page_url: str URL of the page to scrape
        :param selectors: list selectors handed to the Chrome driver
        :return: tuple (str file-name-safe title, str image URL)
        :raises ValueError: if no image URL was obtained
        """
        driver = helper.chromeDriver.ChromeDriver(page_url, selectors)
        items = driver.get_items()

        def pick(key):
            # Missing keys are treated as "not scraped".
            return items[key] if key in items else None

        def first_if_list(value):
            # Scraped values may arrive as lists; only the first entry is used.
            if value and isinstance(value, list):
                return value[0]
            return value

        title = pick('title_jp')
        title_sub = pick('title_en')
        image_urls = pick('image_urls')
        last_image_url = pick('image_url')
        print(title, title_sub, last_image_url, image_urls)

        title = first_if_list(title)
        title_sub = first_if_list(title_sub)
        last_image_url = first_if_list(last_image_url)
        # When an 'image_urls' entry is present its first element takes
        # precedence over 'image_url'.
        if image_urls and image_urls[0]:
            last_image_url = image_urls[0]
        if not last_image_url:
            raise ValueError(f"エラー:last_image_urlが不正[{last_image_url}]")
        if not title:
            if title_sub:
                title = title_sub
            else:
                # No title could be obtained: fall back to a timestamp string.
                now = datetime.datetime.now()
                title = f'{now:%Y%m%d_%H%M%S}'

        fixed_title = driver.fixed_file_name(title)
        url_title = driver.fixed_file_name(page_url)
        target_path = f'{fixed_title}：{url_title}.html'
        driver.save_source(target_path)
        return fixed_title, last_image_url

    def __deployment(self, image_url):
        """Expand the last image URL into a URL list and store it in ``url_list``.

        ``http://host/dir/N.ext`` becomes ``http://host/dir/1.ext`` through
        ``http://host/dir/N.ext``. A file name without an extension is
        expanded the same way, with an empty extension. Prints a message and
        exits the process with status 1 when the argument is empty, has no
        URL scheme, or its base file name is not a decimal number.

        :param image_url: str URL of the last image
        :return: list expanded image URLs (a new list on every call)
        """
        if not image_url:
            print('引数が不正です。空です。')
            sys.exit(1)
        parsed = urlparse(image_url)
        if not parsed.scheme:
            print('引数が不正です。URLではない？')
            sys.exit(1)
        # Split the path at the last '/' into directory part and file name.
        name_start = parsed.path.rfind('/') + 1
        directory = parsed.path[:name_start]
        file_name = parsed.path[name_start:]
        print(directory)
        print(file_name)
        # Split the file name at the last '.'. Without a dot the whole name is
        # the base; slicing with rfind's -1 would cut off the final digit.
        dot = file_name.rfind('.')
        if dot == -1:
            base_name, extension = file_name, ''
        else:
            base_name, extension = file_name[:dot], file_name[dot:]
        print(base_name)
        print(extension)
        if not base_name.isdecimal():
            print('引数が不正です。数値ではない？')
            sys.exit(1)
        last_number = int(base_name)
        urls = [
            urlunparse(parsed._replace(path=f'{directory}{number}{extension}'))
            for number in range(1, last_number + 1)
        ]
        # Bind a fresh list on the instance instead of appending to the
        # class-level list, which would accumulate URLs across instances.
        self.url_list = urls
        return urls

    def get_title(self):
        """Return the title.

        :return: str title (a copy of the value object's title)
        """
        return copy.deepcopy(self.value_object.title)

    def get_image_urls(self):
        """Return the image URL list.

        :return: list image URLs (a deep copy, so callers cannot alter the
            value object's list)
        """
        return copy.deepcopy(self.value_object.image_urls)