Skip to content

Commit

Permalink
Make count_pages also return the current page
Browse files Browse the repository at this point in the history
This is necessary during post downloads to determine whether we need to
download an additional page to get the full thread context.
  • Loading branch information
rossjrw committed Jan 28, 2024
1 parent 02c47ad commit 6f22278
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 64 deletions.
62 changes: 37 additions & 25 deletions notifier/parsethread.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@


def parse_thread_meta(thread: Tag) -> RawThreadMeta:
"""Parse the meta info of a thread to return forum category ID,
category name, and thread title.
"""Parse the meta info of a thread to return forum category ID, category name, and thread title.
:param thread: The thread, as soup. Expected to start at
.forum-thread-box, which is what the ForumViewThreadModule returns.
:param thread: The thread, as soup. Expected to start at .forum-thread-box, which is what the ForumViewThreadModule returns.
Information returned is independent of the thread page passed to this function, except the returned value will include the active page number.
"""
breadcrumbs = cast(Tag, thread.find(class_="forum-breadcrumbs"))
category_link = list(cast(Iterable[Tag], breadcrumbs.find_all("a")))[-1]
Expand All @@ -32,13 +32,15 @@ def parse_thread_meta(thread: Tag) -> RawThreadMeta:
created_timestamp = get_timestamp(statistics)
if created_timestamp is None:
raise ValueError("No timestamp for thread")
page_count, current_page = count_pages(thread)
return {
"category_id": category_id,
"category_name": category_name,
"title": list(breadcrumbs.stripped_strings)[-1].strip(" »"),
"creator_username": creator_username,
"created_timestamp": created_timestamp,
"page_count": count_pages(thread),
"page_count": page_count,
"current_page": current_page,
}


Expand Down Expand Up @@ -202,31 +204,41 @@ def get_timestamp(element: Tag) -> Optional[int]:
return posted_timestamp


def count_pages(module_result: Union[str, Tag]) -> Tuple[int, Optional[int]]:
    """Count the pages in a Wikidot module and get the current page.

    Takes the HTML (as text or soup) of the output of any module that can
    return with a pager, and reads the text of the last numeric page button
    to get the page count.

    If a pager is not present, the page count is assumed to be 1 and the
    current page is reported as None.

    It's possible that no page is marked as the current one (cause unknown,
    but it has been observed in practice); the current page is None in that
    case too.

    This process only works for modules that return pagers of a fixed
    length (the only known module that does not do this is page history).

    :param module_result: The module output, either as an HTML string
        (parsed here with html.parser) or as already-parsed soup.
    :returns: A tuple of (page count, current page). The current page is
        None when no pager is present, when no selector is marked as
        current, or when the current selector's text is not numeric.
    """
    if isinstance(module_result, str):
        module_result = BeautifulSoup(module_result, "html.parser")

    # Defaults that apply when the pager is missing or unreadable
    page_count = 1
    current_page: Optional[int] = None

    # Note that there are no page selectors if there is only one page
    page_selectors = cast(Optional[Tag], module_result.find(class_="pager"))
    if page_selectors is None:
        return page_count, current_page

    current_selector = page_selectors.find(class_="current")
    if current_selector is not None:
        try:
            current_page = int(current_selector.get_text())
        except ValueError:
            # A .current selector without numeric text tells us nothing;
            # keep reporting the current page as unknown
            pass

    # The final page selector is the last one with numeric text. It may be
    # .target (clickable) or .current (unclickable). Non-numeric text
    # indicates e.g. a 'next' button, so walk backwards until a number
    # is found.
    for selector in reversed(page_selectors.contents):
        try:
            page_count = int(selector.get_text())
        except ValueError:
            continue
        break

    return page_count, current_page
1 change: 1 addition & 0 deletions notifier/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ class RawThreadMeta(TypedDict):
creator_username: Optional[str]
created_timestamp: int
page_count: int
current_page: Optional[int]


class RawPost(TypedDict):
Expand Down
2 changes: 1 addition & 1 deletion notifier/wikiconnection.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def paginated_module(
)
first_page = self.module(wiki, module_name, **module_kwargs)
yield first_page
page_count = count_pages(first_page["body"])
page_count, _ = count_pages(first_page["body"])
# Iterate through the remaining pages
# Start from the starting index plus one, because the first page
# was already done
Expand Down
124 changes: 86 additions & 38 deletions tests/test_parsethread.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
from bs4 import BeautifulSoup
from bs4.element import Tag

from notifier.parsethread import get_user_from_nametag
from notifier.parsethread import count_pages, get_user_from_nametag

# pylint:disable=missing-function-docstring
# pylint:disable=missing-class-docstring


def test_get_user_from_nametag() -> None:
Expand All @@ -27,41 +30,86 @@ def __post_init__(self) -> None:
if self.expected_username is not None:
assert len(self.expected_username) <= 20

user_tags = [
UserTag(
"Anonymous user with visible IP",
"""<span class="printuser anonymous"><a href="javascript:;" onclick="WIKIDOT.page.listeners.anonymousUserInfo('75.142.217.5'); return false;"><img class="small" src="https://www.wikidot.com/common--images/avatars/default/a16.png" alt=""></a><a href="javascript:;" onclick="WIKIDOT.page.listeners.anonymousUserInfo('75.142.217.5'); return false;">Anonymous <span class="ip">(75.142.217.x)</span></a></span>""",
None,
None,
),
UserTag(
"Guest user",
"""<span class="printuser avatarhover"><a href="javascript:;"><img class="small" src="https://secure.gravatar.com/avatar.php?gravatar_id=d0f7d0914b3a679ead94c8a16168f63f&amp;default=https://www.wikidot.com/common--images/avatars/default/a16.png&amp;size=16" alt=""></a>chelonianmobile (guest)</span>""",
None,
"chelonianmobile",
),
UserTag(
"Deleted user",
"""<span class="printuser deleted" data-id="462110"><img class="small" src="https://www.wikidot.com/common--images/avatars/default/a16.png" alt="">(account deleted)</span>""",
"462110",
None,
),
UserTag(
"Normal user (from a forum post)",
"""<span class="printuser avatarhover"><a href="http://www.wikidot.com/user:info/croquembouche" onclick="WIKIDOT.page.listeners.userInfo(2893766); return false;"><img class="small" src="https://www.wikidot.com/avatar.php?userid=2893766&amp;amp;size=small&amp;amp;timestamp=1686573582" alt="Croquembouche" style="background-image:url(https://www.wikidot.com/userkarma.php?u=2893766)"></a><a href="http://www.wikidot.com/user:info/croquembouche" onclick="WIKIDOT.page.listeners.userInfo(2893766); return false;">Croquembouche</a></span>""",
"2893766",
"Croquembouche",
),
UserTag(
"System user",
"""<span class="printuser">Wikidot</span>""",
None,
"Wikidot",
),
]
def test(self) -> None:
assert (
self.expected_user_id,
self.expected_username,
) == get_user_from_nametag(self.tag)

UserTag(
"Anonymous user with visible IP",
"""<span class="printuser anonymous"><a href="javascript:;" onclick="WIKIDOT.page.listeners.anonymousUserInfo('75.142.217.5'); return false;"><img class="small" src="https://www.wikidot.com/common--images/avatars/default/a16.png" alt=""></a><a href="javascript:;" onclick="WIKIDOT.page.listeners.anonymousUserInfo('75.142.217.5'); return false;">Anonymous <span class="ip">(75.142.217.x)</span></a></span>""",
None,
None,
).test()
UserTag(
"Guest user",
"""<span class="printuser avatarhover"><a href="javascript:;"><img class="small" src="https://secure.gravatar.com/avatar.php?gravatar_id=d0f7d0914b3a679ead94c8a16168f63f&amp;default=https://www.wikidot.com/common--images/avatars/default/a16.png&amp;size=16" alt=""></a>chelonianmobile (guest)</span>""",
None,
"chelonianmobile",
).test()
UserTag(
"Deleted user",
"""<span class="printuser deleted" data-id="462110"><img class="small" src="https://www.wikidot.com/common--images/avatars/default/a16.png" alt="">(account deleted)</span>""",
"462110",
None,
).test()
UserTag(
"Normal user (from a forum post)",
"""<span class="printuser avatarhover"><a href="http://www.wikidot.com/user:info/croquembouche" onclick="WIKIDOT.page.listeners.userInfo(2893766); return false;"><img class="small" src="https://www.wikidot.com/avatar.php?userid=2893766&amp;amp;size=small&amp;amp;timestamp=1686573582" alt="Croquembouche" style="background-image:url(https://www.wikidot.com/userkarma.php?u=2893766)"></a><a href="http://www.wikidot.com/user:info/croquembouche" onclick="WIKIDOT.page.listeners.userInfo(2893766); return false;">Croquembouche</a></span>""",
"2893766",
"Croquembouche",
).test()
UserTag(
"System user",
"""<span class="printuser">Wikidot</span>""",
None,
"Wikidot",
).test()


def test_count_pages() -> None:
    """Check count_pages against sample pager markup.

    Each case pairs a pager HTML snippet with the current page and page
    count that count_pages is expected to extract from it.
    """

    @dataclass
    class Pager:
        # Raw HTML handed to count_pages as a string
        html: str
        # Number shown in the .current selector
        expected_current_page: int
        # Number shown in the last numeric selector
        expected_page_count: int

        def test(self) -> None:
            # count_pages returns (page count, current page), in that order
            assert (
                self.expected_page_count,
                self.expected_current_page,
            ) == count_pages(self.html)

    # First page of many: .current is the first numeric selector and the
    # pager ends with a non-numeric 'next' button
    Pager(
        """<div class="forum-category-box">
<div class="pager">
<span class="pager-no">page 1 of 263</span>
<span class="current">1</span>
<span class="target"><a href="/forum/c-00000/p/2">2</a></span>
<span class="target"><a href="/forum/c-00000/p/3">3</a></span>
<span class="dots">...</span>
<span class="target"><a href="/forum/c-00000/p/262">262</a></span>
<span class="target"><a href="/forum/c-00000/p/263">263</a></span>
<span class="target"><a href="/forum/c-00000/p/2">next »</a></span>
</div>
</div>""",
        1,
        263,
    ).test()
    # Penultimate page: .current sits between .target selectors and both
    # 'previous' and 'next' buttons are present
    Pager(
        """<div class="pager">
<span class="pager-no">page 262 of 263</span>
<span class="target"><a href="/forum/c-891087/p/261">« previous</a></span>
<span class="target"><a href="/forum/c-891087/p/1">1</a></span>
<span class="target"><a href="/forum/c-891087/p/2">2</a></span>
<span class="dots">...</span>
<span class="target"><a href="/forum/c-891087/p/260">260</a></span>
<span class="target"><a href="/forum/c-891087/p/261">261</a></span>
<span class="current">262</span>
<span class="target"><a href="/forum/c-891087/p/263">263</a></span>
<span class="target"><a href="/forum/c-891087/p/263">next »</a></span>
</div>""",
        262,
        263,
    ).test()

0 comments on commit 6f22278

Please sign in to comment.