@task(retries=3, retry_delay_seconds=2)
def fetch_html(url: str) -> str:
    """Fetch the page at ``url`` and return its HTML as text.

    The body is an ordinary ``requests`` call; retrying is configured on
    the ``@task`` decorator (``retries=3``, ``retry_delay_seconds=2``)
    rather than written here, so failures are simply allowed to raise.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as a string.

    Raises:
        requests.HTTPError: If the server responds with an error status
            (raised by ``raise_for_status``).
    """
    print(f"Fetching {url} …")
    page = requests.get(url, timeout=10)
    # Turn an error status into an exception instead of returning the
    # error page's body as if it were the article.
    page.raise_for_status()
    html = page.text
    return html
@task
def parse_article(html: str) -> str:
    """Extract readable article text from an HTML page, skipping code blocks.

    The first ``<article>`` element is used as the content root, falling
    back to the first ``<main>`` element. ``<pre>`` and ``<code>`` elements
    are removed, then headings, paragraphs and list items are collected in
    document order.

    Args:
        html: Raw HTML of the page.

    Returns:
        The extracted text joined with newlines. Headings (h1-h3) are
        upper-cased and framed by rules of 80 ``=`` characters; every
        paragraph or list item is followed by a blank line. Returns ``""``
        when the page has neither an ``<article>`` nor a ``<main>`` element.
    """
    soup = BeautifulSoup(html, "html.parser")

    article = soup.find("article") or soup.find("main")
    if article is None:
        return ""

    # Remove code samples before any text is collected.
    for code in article.find_all(["pre", "code"]):
        code.decompose()

    heading_tags = ("h1", "h2", "h3")
    # Only list items are selected, not their <ul>/<ol> containers: a
    # container's get_text() already includes every item, so selecting both
    # emitted each list twice (once as one blob, then again item by item).
    # NOTE(review): elements nested inside another selected element (e.g. a
    # <p> inside an <li>) are still visited separately and so still repeat.
    body_tags = ("p", "li")
    rule = "=" * 80

    content = []
    for elem in article.find_all([*heading_tags, *body_tags]):
        text = elem.get_text().strip()
        if not text:
            continue
        if elem.name in heading_tags:
            content.extend(["\n" + rule, text.upper(), rule + "\n"])
        else:
            content.extend([text, ""])
    return "\n".join(content)