Skip to content

Spider

core.spider

input_args = {'user': user} module-attribute

process = CrawlerProcess(settings={'ITEM_PIPELINES': {'src.repository.spider.RedisWriterPipeline': 1}}) module-attribute

user = TVTimeUser(username='string', password='string') module-attribute

JsonWriterPipeline

Source code in src/core/spider.py
138
139
140
141
142
143
144
145
146
147
148
class JsonWriterPipeline:
    """Scrapy item pipeline that appends each scraped item to ``items.json``,
    one JSON document per line (JSON Lines format)."""

    def open_spider(self, spider):
        """Open the output file when the spider starts."""
        # Explicit encoding: the platform default may not be UTF-8.
        self.file = open("items.json", "w", encoding="utf-8")

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and return it unchanged so any
        later pipelines still receive it (Scrapy contract)."""
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

close_spider(spider)

Source code in src/core/spider.py
142
143
def close_spider(self, spider):
    """Release the JSON output file once the crawl is over."""
    self.file.close()

open_spider(spider)

Source code in src/core/spider.py
139
140
def open_spider(self, spider):
    """Open ``items.json`` for writing when the spider starts."""
    # Explicit encoding: the platform default may not be UTF-8.
    self.file = open("items.json", "w", encoding="utf-8")

process_item(item, spider)

Source code in src/core/spider.py
145
146
147
148
def process_item(self, item, spider):
    """Write *item* to the open JSON-lines file and hand it on unchanged."""
    self.file.write(json.dumps(dict(item)) + "\n")
    return item

RedisWriterPipeline

Source code in src/core/spider.py
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
class RedisWriterPipeline:
    """Scrapy item pipeline that persists scraped TV Time data to Redis.

    One ``TVTimeDataModel`` record per crawl, keyed by the spider user's
    username, with a 24-hour TTL.
    """

    def __init__(self, user) -> None:
        """Open the Redis connection and remember the target username."""
        self.redis = get_redis_connection(url=REDIS_URL)
        self.username = user.username
        # Reuse the single connection instead of opening a second one
        # that would never be closed.
        TVTimeDataModel.Meta.database = self.redis

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy factory hook: build the pipeline from the spider's user."""
        return cls(crawler.spider.user)

    def open_spider(self, spider):
        """Create this crawl's Redis record and give it a 24-hour TTL."""
        logger.debug(f"Opening spider {spider.name}")
        self.data = TVTimeDataModel(username=self.username)
        self.data.save()
        self.data.expire(86400)  # seconds: 24 hours

    def process_item(self, item, spider):
        """Copy the item payload onto the model field matching its name, persist,
        and return the item (Scrapy contract) so later pipelines still see it."""
        logger.debug(f"Processing item {item}")
        name = item.get("name")
        if name == "id":
            self.data.user_id = item.get("data").get("user_id")
        elif name == "to-watch":
            self.data.watch_next = item.get("data")
        elif name == "upcoming":
            self.data.upcoming = item.get("data")
        elif name == "profile":
            self.data.profile = item.get("data")
        self.data.save()
        return item

    def close_spider(self, spider):
        """Discard the record if the user was not found, then close Redis."""
        logger.debug(f"Closing spider {spider.name}")
        if spider.USER_NOTFOUND_FLAG:
            self.data.expire(0)  # TTL of zero drops the key immediately
        self.redis.close()

redis = get_redis_connection(url=REDIS_URL) instance-attribute

username = user.username instance-attribute

__init__(user)

Source code in src/core/spider.py
152
153
154
155
def __init__(self, user) -> None:
    """Open the Redis connection and remember the target username."""
    self.redis = get_redis_connection(url=REDIS_URL)
    self.username = user.username
    # Reuse the one connection rather than opening a second, unclosed one.
    TVTimeDataModel.Meta.database = self.redis

close_spider(spider)

Source code in src/core/spider.py
180
181
182
183
184
def close_spider(self, spider):
    """Drop the stored record for unknown users, then release the Redis connection."""
    logger.debug(f"Closing spider {spider.name}")
    if spider.USER_NOTFOUND_FLAG:
        # A TTL of zero expires the key immediately.
        self.data.expire(0)
    self.redis.close()

from_crawler(crawler) classmethod

Source code in src/core/spider.py
157
158
159
@classmethod
def from_crawler(cls, crawler):
    """Scrapy factory hook: build the pipeline from the running spider's user."""
    return cls(crawler.spider.user)

open_spider(spider)

Source code in src/core/spider.py
161
162
163
164
165
def open_spider(self, spider):
    """Create this crawl's Redis record and schedule it to expire in 24 h."""
    logger.debug(f"Opening spider {spider.name}")
    self.data = TVTimeDataModel(username=self.username)
    self.data.save()
    # 86400 s == 24 h TTL.
    self.data.expire(86400)

process_item(item, spider)

Source code in src/core/spider.py
167
168
169
170
171
172
173
174
175
176
177
178
def process_item(self, item, spider):
    """Copy the item's payload onto the model field matching its name, then persist.

    Returns the item so any later pipelines still receive it (Scrapy contract);
    the original returned ``None``, which would feed ``None`` downstream.
    """
    logger.debug(f"Processing item {item}")
    name = item.get("name")
    if name == "id":
        self.data.user_id = item.get("data").get("user_id")
    elif name == "to-watch":
        self.data.watch_next = item.get("data")
    elif name == "upcoming":
        self.data.upcoming = item.get("data")
    elif name == "profile":
        self.data.profile = item.get("data")
    self.data.save()
    return item

TVTimeSpider

Bases: scrapy.Spider

Source code in src/core/spider.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
class TVTimeSpider(scrapy.Spider):
    """Sign in to tvtime.com as *user* and scrape their watch-next list,
    upcoming episodes and profile, yielding one dict item per section."""

    name = "tvtime"
    # Set by logged_in() when the page carries an empty user id; read by
    # RedisWriterPipeline.close_spider to discard the stored record.
    USER_NOTFOUND_FLAG = False

    def __init__(self, user: TVTimeUser, *args, **kwargs):
        """Remember the credentials object consumed by start_requests()."""
        super().__init__(*args, **kwargs)
        self.user = user
        logger.info(f"User: {user}")

    def start_requests(self):
        """Begin the crawl by POSTing the sign-in form with the stored credentials."""
        logger.info(f"Starting Requests for {self.user.username}")
        yield scrapy.FormRequest(
            url="https://www.tvtime.com/signin",
            formdata={
                "username": self.user.username,
                "password": self.user.password,
            },
            callback=self.logged_in,
        )

    def logged_in(self, response):
        """After sign-in: extract the numeric user id, emit it as an item,
        then request the to-watch, upcoming and profile pages.

        Sets USER_NOTFOUND_FLAG and raises CloseSpider on an empty id.
        """
        logger.debug(f"Logged in {self.user.username}")
        # The id is embedded in the inline <script> that defines tvst.user.
        script_content = response.selector.xpath(
            '/html/head/script[contains(text(), "tvst.user")]/text()'
        ).get()
        # Digits inside ` id: "12345"`.
        # NOTE(review): re.search returns None when the pattern is missing,
        # which would raise AttributeError here before the empty-id check.
        user_id = re.search(r'(?<= id:[ ]")[0-9]*', script_content).group(0)
        if user_id == "":
            logger.error("User not found")
            self.USER_NOTFOUND_FLAG = True
            raise CloseSpider("User not found")
        user_id = int(user_id)
        yield {"name": "id", "data": {"user_id": user_id}}
        yield response.follow(TVTIME_TOWATCH_URL, self.parse_to_watch)
        yield response.follow(TVTIME_UPCOMING_URL, self.parse_upcoming)
        yield scrapy.Request(
            f"{TVTIME_PROFILE_URL}/{user_id}/profile", self.parse_profile
        )

    def parse_to_watch(self, response):
        """Parse the to-watch page into {section title: {show: {episode, is_new}}}."""
        result = {"name": "to-watch", "data": {}}
        # Each <h1> heads the <ul> of shows that follows it.
        items = response.selector.xpath('//*[@id="to-watch"]/ul')
        titles = response.selector.xpath('//*[@id="to-watch"]/h1')
        for idx in range(len(items)):
            title = titles[idx].xpath("./text()").get().strip()
            result["data"][title] = {}
            new_tags = items[idx].xpath('.//div[@class="new-label"]/text()').getall()
            shows = items[idx].xpath(".//img/@alt").getall()
            episodes = (
                items[idx]
                .xpath('.//div[@class="episode-details poster-details"]/h2/a/text()')
                .getall()
            )
            for show, episode in zip(shows, episodes):
                tag = False
                # NOTE(review): this marks the first len(new_tags) shows as
                # new, assuming badge order matches show order — confirm in
                # the page markup.
                if len(new_tags) > 0:
                    tag = True
                    new_tags.pop(0)
                temp = {"episode": episode, "is_new": tag}
                result["data"][title][show] = temp

        # title_not_watched = items[0].xpath('.//img/@alt').getall()
        # title_not_started = items[1].xpath('.//img/@alt').getall()
        # result['data']['not-watched'] = title_not_watched
        # result['data']['not-started'] = title_not_started
        # script_content = response.selector.xpath('//div[@class="main-block-container"]/script/text()').get()
        # data_content = re.search(r'(tvst.data = )(.*)(;)', script_content, re.DOTALL).group(2)
        # data_content = data_content.replace('\&quot;', ' ')
        # data_content = data_content.replace("'", "")
        # data_content = data_content.replace('evt:', '')
        # data_content = data_content.replace('//', '')
        # data_content = data_content.replace("\&quot;", '"').replace("'[", "[").replace("]'", "]").replace('toWatchEpisodes', '"toWatchEpisodes"').replace('trendingShows', '"trendingShows"').replace('evt', '"evt"')
        # data_content = data_content.replace('\\"', '').replace('\\', '').replace("'", '"')
        # with open('data.json', 'w') as f:
        #     f.write(data_content)
        # data = json.loads(data_content)
        return result

    def parse_upcoming(self, response):
        """Parse the upcoming-episodes page into {show title: {episode: air day}}."""
        result = {"name": "upcoming", "data": {}}
        items = response.selector.xpath('//*[@id="upcoming-episodes"]/ul/li')
        for item in items:
            title = item.xpath(
                './/div[@class="episode-details poster-details"]/a/text()'
            ).get()
            # NOTE(review): this XPath is absolute ('//*'), so it matches from
            # the document root and yields the page's first episode for every
            # row — probably should be relative ('.//div...') like the title
            # and day selectors; confirm.
            episode = item.xpath(
                '//*[@id="upcoming-episodes"]/ul/li/div[@class="episode-details poster-details"]/h2/a/text()'
            ).get()
            day = item.xpath('.//div[@class="overlay"]//ul/li/div/text()').get()
            if title:
                result["data"][title] = {episode: day}
        return result

    def parse_profile(self, response):
        """Extract the inline `tvst.data` JS object from the profile page,
        massage it into valid JSON and return it as the item payload."""
        result = {"name": "profile", "data": {}}
        script_content = response.selector.xpath(
            '//div[@class="main-block-container"]/script/text()'
        ).get()
        # Everything between `tvst.data = ` and the trailing `;`.
        data_content = re.search(
            r"(tvst.data = )(.*)(;)", script_content, re.DOTALL
        ).group(2)
        # Hand-rolled JS -> JSON cleanup: unescape quotes, quote the two known
        # bare keys, strip stray quote/escape characters around arrays/objects.
        data_content = (
            data_content.replace("\&quot;", '"')
            .replace("shows", '"shows"', 1)
            .replace("profile", '"profile"', 1)
            .replace("'[", "[")
            .replace("]'", "]")
            .replace("'{", "{")
            .replace("}'", "}")
            .replace("\&", "")
        )
        # Debug dump of the cleaned payload; written to the working directory.
        with open("data.json", "w") as f:
            f.write(data_content)
        result["data"] = json.loads(data_content)
        return result

USER_NOTFOUND_FLAG = False class-attribute

name = 'tvtime' class-attribute

user = user instance-attribute

__init__(user, *args, **kwargs)

Source code in src/core/spider.py
26
27
28
29
def __init__(self, user: TVTimeUser, *args, **kwargs):
    """Keep the credentials object for start_requests() and log who we crawl."""
    super().__init__(*args, **kwargs)
    self.user = user
    logger.info(f"User: {user}")

logged_in(response)

Source code in src/core/spider.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def logged_in(self, response):
    """After sign-in: extract the numeric user id, emit it as an item,
    then request the to-watch, upcoming and profile pages.

    Sets USER_NOTFOUND_FLAG and raises CloseSpider on an empty id.
    """
    logger.debug(f"Logged in {self.user.username}")
    # The id is embedded in the inline <script> that defines tvst.user.
    script_content = response.selector.xpath(
        '/html/head/script[contains(text(), "tvst.user")]/text()'
    ).get()
    # Digits inside ` id: "12345"`.
    # NOTE(review): re.search returns None when the pattern is missing,
    # which would raise AttributeError here before the empty-id check.
    user_id = re.search(r'(?<= id:[ ]")[0-9]*', script_content).group(0)
    if user_id == "":
        logger.error("User not found")
        self.USER_NOTFOUND_FLAG = True
        raise CloseSpider("User not found")
    user_id = int(user_id)
    yield {"name": "id", "data": {"user_id": user_id}}
    yield response.follow(TVTIME_TOWATCH_URL, self.parse_to_watch)
    yield response.follow(TVTIME_UPCOMING_URL, self.parse_upcoming)
    yield scrapy.Request(
        f"{TVTIME_PROFILE_URL}/{user_id}/profile", self.parse_profile
    )

parse_profile(response)

Source code in src/core/spider.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def parse_profile(self, response):
    """Extract the inline `tvst.data` JS object from the profile page,
    massage it into valid JSON and return it as the item payload."""
    result = {"name": "profile", "data": {}}
    script_content = response.selector.xpath(
        '//div[@class="main-block-container"]/script/text()'
    ).get()
    # Everything between `tvst.data = ` and the trailing `;`.
    data_content = re.search(
        r"(tvst.data = )(.*)(;)", script_content, re.DOTALL
    ).group(2)
    # Hand-rolled JS -> JSON cleanup: unescape quotes, quote the two known
    # bare keys, strip stray quote/escape characters around arrays/objects.
    data_content = (
        data_content.replace("\&quot;", '"')
        .replace("shows", '"shows"', 1)
        .replace("profile", '"profile"', 1)
        .replace("'[", "[")
        .replace("]'", "]")
        .replace("'{", "{")
        .replace("}'", "}")
        .replace("\&", "")
    )
    # Debug dump of the cleaned payload; written to the working directory.
    with open("data.json", "w") as f:
        f.write(data_content)
    result["data"] = json.loads(data_content)
    return result

parse_to_watch(response)

Source code in src/core/spider.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def parse_to_watch(self, response):
    """Parse the to-watch page into {section title: {show: {episode, is_new}}}."""
    result = {"name": "to-watch", "data": {}}
    # Each <h1> heads the <ul> of shows that follows it.
    items = response.selector.xpath('//*[@id="to-watch"]/ul')
    titles = response.selector.xpath('//*[@id="to-watch"]/h1')
    for idx in range(len(items)):
        title = titles[idx].xpath("./text()").get().strip()
        result["data"][title] = {}
        new_tags = items[idx].xpath('.//div[@class="new-label"]/text()').getall()
        shows = items[idx].xpath(".//img/@alt").getall()
        episodes = (
            items[idx]
            .xpath('.//div[@class="episode-details poster-details"]/h2/a/text()')
            .getall()
        )
        for show, episode in zip(shows, episodes):
            tag = False
            # NOTE(review): this marks the first len(new_tags) shows as new,
            # assuming badge order matches show order — confirm in markup.
            if len(new_tags) > 0:
                tag = True
                new_tags.pop(0)
            temp = {"episode": episode, "is_new": tag}
            result["data"][title][show] = temp

    # title_not_watched = items[0].xpath('.//img/@alt').getall()
    # title_not_started = items[1].xpath('.//img/@alt').getall()
    # result['data']['not-watched'] = title_not_watched
    # result['data']['not-started'] = title_not_started
    # script_content = response.selector.xpath('//div[@class="main-block-container"]/script/text()').get()
    # data_content = re.search(r'(tvst.data = )(.*)(;)', script_content, re.DOTALL).group(2)
    # data_content = data_content.replace('\&quot;', ' ')
    # data_content = data_content.replace("'", "")
    # data_content = data_content.replace('evt:', '')
    # data_content = data_content.replace('//', '')
    # data_content = data_content.replace("\&quot;", '"').replace("'[", "[").replace("]'", "]").replace('toWatchEpisodes', '"toWatchEpisodes"').replace('trendingShows', '"trendingShows"').replace('evt', '"evt"')
    # data_content = data_content.replace('\\"', '').replace('\\', '').replace("'", '"')
    # with open('data.json', 'w') as f:
    #     f.write(data_content)
    # data = json.loads(data_content)
    return result

parse_upcoming(response)

Source code in src/core/spider.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def parse_upcoming(self, response):
    """Parse the upcoming-episodes page into {show title: {episode: air day}}."""
    result = {"name": "upcoming", "data": {}}
    items = response.selector.xpath('//*[@id="upcoming-episodes"]/ul/li')
    for item in items:
        title = item.xpath(
            './/div[@class="episode-details poster-details"]/a/text()'
        ).get()
        # Relative to this <li>: the original absolute '//*[@id=...]' path
        # matched from the document root, so every row received the first
        # episode on the page.
        episode = item.xpath(
            './/div[@class="episode-details poster-details"]/h2/a/text()'
        ).get()
        day = item.xpath('.//div[@class="overlay"]//ul/li/div/text()').get()
        if title:
            result["data"][title] = {episode: day}
    return result

start_requests()

Source code in src/core/spider.py
31
32
33
34
35
36
37
38
39
40
def start_requests(self):
    """Begin the crawl by submitting the tvtime sign-in form."""
    logger.info(f"Starting Requests for {self.user.username}")
    credentials = {
        "username": self.user.username,
        "password": self.user.password,
    }
    yield scrapy.FormRequest(
        url="https://www.tvtime.com/signin",
        formdata=credentials,
        callback=self.logged_in,
    )