22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
class TVTimeSpider(scrapy.Spider):
    """Spider that signs in to tvtime.com and scrapes data for one user.

    After the sign-in form is submitted, the spider emits four items, each a
    dict of the form ``{"name": <section>, "data": <payload>}`` where the
    section is one of ``"id"``, ``"to-watch"``, ``"upcoming"`` or
    ``"profile"``.

    Attributes:
        name: Scrapy spider name.
        USER_NOTFOUND_FLAG: Set to ``True`` on the instance when no user id
            could be extracted from the sign-in response.
        user: The credentials object passed to the constructor; its
            ``username`` and ``password`` attributes are read.
    """

    name = "tvtime"
    USER_NOTFOUND_FLAG = False

    def __init__(self, user: TVTimeUser, *args, **kwargs):
        """Store the user whose account will be scraped.

        Args:
            user: Object exposing ``username`` and ``password``.
            *args: Forwarded to ``scrapy.Spider.__init__``.
            **kwargs: Forwarded to ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.user = user
        # NOTE(review): this logs the string form of ``user``; if that
        # includes the password it ends up in the log -- confirm.
        logger.info(f"User: {user}")

    def start_requests(self):
        """Yield the sign-in form request that starts the crawl."""
        logger.info(f"Starting Requests for {self.user.username}")
        yield scrapy.FormRequest(
            url="https://www.tvtime.com/signin",
            formdata={
                "username": self.user.username,
                "password": self.user.password,
            },
            callback=self.logged_in,
        )

    def logged_in(self, response):
        """Extract the user id from the sign-in response and fan out.

        Yields the ``"id"`` item followed by requests for the to-watch,
        upcoming and profile pages.

        Raises:
            CloseSpider: If no numeric user id is found in the page's
                ``tvst.user`` script.
        """
        logger.debug(f"Logged in {self.user.username}")
        script_content = response.selector.xpath(
            '/html/head/script[contains(text(), "tvst.user")]/text()'
        ).get()
        # The script tag may be absent and the pattern may not match at all;
        # both cases must end in the "User not found" path instead of an
        # AttributeError/TypeError.
        match = re.search(r'(?<= id:[ ]")[0-9]*', script_content or "")
        user_id = match.group(0) if match else ""
        if not user_id:
            logger.error("User not found")
            self.USER_NOTFOUND_FLAG = True
            raise CloseSpider("User not found")
        user_id = int(user_id)
        yield {"name": "id", "data": {"user_id": user_id}}
        yield response.follow(TVTIME_TOWATCH_URL, self.parse_to_watch)
        yield response.follow(TVTIME_UPCOMING_URL, self.parse_upcoming)
        yield scrapy.Request(
            f"{TVTIME_PROFILE_URL}/{user_id}/profile", self.parse_profile
        )

    def parse_to_watch(self, response) -> dict:
        """Scrape the to-watch page.

        Returns:
            ``{"name": "to-watch", "data": {section: {show: info}}}`` where
            ``section`` is the text of each ``<h1>`` under ``#to-watch`` and
            ``info`` is ``{"episode": str, "is_new": bool}``.
        """
        result = {"name": "to-watch", "data": {}}
        sections = response.selector.xpath('//*[@id="to-watch"]/ul')
        headings = response.selector.xpath('//*[@id="to-watch"]/h1')
        # zip() pairs each heading with its list and stops at the shorter of
        # the two, so a missing heading cannot raise IndexError.
        for heading, section in zip(headings, sections):
            title = (heading.xpath("./text()").get() or "").strip()
            entries = result["data"][title] = {}
            new_count = len(
                section.xpath('.//div[@class="new-label"]/text()').getall()
            )
            shows = section.xpath(".//img/@alt").getall()
            episodes = section.xpath(
                './/div[@class="episode-details poster-details"]/h2/a/text()'
            ).getall()
            # NOTE(review): the first ``new_count`` shows of a section are
            # flagged as new, i.e. labels are matched by position rather than
            # per show -- confirm this matches the page layout.
            for position, (show, episode) in enumerate(zip(shows, episodes)):
                entries[show] = {
                    "episode": episode,
                    "is_new": position < new_count,
                }
        return result

    def parse_upcoming(self, response) -> dict:
        """Scrape the upcoming-episodes page.

        Returns:
            ``{"name": "upcoming", "data": {show: {episode: day}}}``. Rows
            without a show title are skipped.
        """
        result = {"name": "upcoming", "data": {}}
        rows = response.selector.xpath('//*[@id="upcoming-episodes"]/ul/li')
        for row in rows:
            title = row.xpath(
                './/div[@class="episode-details poster-details"]/a/text()'
            ).get()
            # Relative XPath: an absolute ``//`` query would be evaluated
            # against the whole document and always return the first row's
            # episode.
            episode = row.xpath(
                './/div[@class="episode-details poster-details"]/h2/a/text()'
            ).get()
            day = row.xpath('.//div[@class="overlay"]//ul/li/div/text()').get()
            if title:
                # Merge instead of overwrite so several upcoming episodes of
                # the same show are all kept.
                result["data"].setdefault(title, {})[episode] = day
        return result

    def parse_profile(self, response) -> dict:
        """Scrape the profile page's embedded ``tvst.data`` object.

        The JavaScript object literal is patched into valid JSON with a few
        textual substitutions, dumped to ``data.json`` in the current
        working directory, and parsed.

        Returns:
            ``{"name": "profile", "data": <parsed object>}``; ``data`` is an
            empty dict when the script block cannot be located.

        Raises:
            json.JSONDecodeError: If the patched text is still not valid
                JSON.
        """
        result = {"name": "profile", "data": {}}
        script_content = response.selector.xpath(
            '//div[@class="main-block-container"]/script/text()'
        ).get()
        match = re.search(
            r"(tvst\.data = )(.*)(;)", script_content or "", re.DOTALL
        )
        if match is None:
            logger.error("Profile data script not found")
            return result
        data_content = (
            match.group(2)
            # Quote the two bare top-level keys (first occurrence only).
            .replace("shows", '"shows"', 1)
            .replace("profile", '"profile"', 1)
            # Unwrap JSON fragments that were embedded as single-quoted
            # strings.
            .replace("'[", "[")
            .replace("]'", "]")
            .replace("'{", "{")
            .replace("}'", "}")
            # Drop backslash-ampersand sequences, which JSON does not accept.
            .replace("\\&", "")
        )
        # NOTE(review): looks like a debugging dump; kept because other code
        # may read the file -- confirm whether it is still needed.
        with open("data.json", "w", encoding="utf-8") as f:
            f.write(data_content)
        result["data"] = json.loads(data_content)
        return result
|