Skip to content

Commit

Permalink
updating W3C message parser to address new site design. Fixes #601
Browse files Browse the repository at this point in the history
  • Loading branch information
sbenthall committed Jul 24, 2023
1 parent 6a0ec35 commit 20b940d
Showing 1 changed file with 7 additions and 7 deletions.
14 changes: 7 additions & 7 deletions bigbang/ingress/w3c.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,10 @@ def _get_header_from_html(self, soup: BeautifulSoup) -> Dict[str, str]:
soup : HTML code from which the Email header can be obtained.
"""
header = {
- "message-ID": "#message-id",
- "Date": "#date",
- "To": "#to",
- "Cc": "#cc",
+ "message-ID": ".message-id",
+ "Date": ".date",
+ "To": ".to",
+ "Cc": ".cc",
}
for key, value in header.items():
try:
Expand All @@ -116,9 +116,9 @@ def _get_header_from_html(self, soup: BeautifulSoup) -> Dict[str, str]:
continue
header["Subject"] = text_for_selector(soup, "h1")

- from_text = parse_dfn_header(text_for_selector(soup, "#from"))
+ from_text = parse_dfn_header(text_for_selector(soup, ".from"))
from_name = from_text.split("<")[0].strip()
- from_address = text_for_selector(soup, "#from a")
+ from_address = text_for_selector(soup, ".from a")
header["From"] = email.utils.formataddr(
(from_name, email.header.Header(from_address).encode())
)
Expand All @@ -144,7 +144,7 @@ def _get_body_from_html(
"""
# TODO re-write using email.parser.Parser
try:
- return text_for_selector(soup, "#body")
+ return text_for_selector(soup, ".body")
except Exception:
logger.exception(f"The message body of {url} could not be loaded.")
return None
Expand Down

0 comments on commit 20b940d

Please sign in to comment.