# Source code for the_data_packet.sources.wired

"""Wired.com article source implementation.

This module implements article collection from Wired.com using RSS feeds and
web scraping. Wired.com provides RSS feeds for different categories that contain
recent article URLs, which are then scraped for full content.

Features:
    - RSS feed-based article discovery
    - Multiple category support (security, science, AI)
    - Robust content extraction with fallback methods
    - Content cleaning and validation
    - Error handling for network issues and malformed content

RSS Feed Strategy:
    1. Fetch category-specific RSS feed
    2. Parse feed to extract article URLs
    3. Scrape individual articles for full content
    4. Clean and validate extracted content
    5. Return standardized Article objects

Content Extraction:
    - Primary: Article body containers and paragraph tags
    - Fallback: Main content areas and text containers
    - Cleaning: Remove navigation, ads, and boilerplate text
    - Validation: Ensure sufficient content length

Supported Categories:
    - security: Security and cybersecurity articles
    - science: Science and technology research
    - ai: Artificial intelligence and machine learning

Rate Limiting:
    - Respectful delays between requests
    - Connection reuse via HTTP session
    - Proper User-Agent identification

Example Usage:
    source = WiredSource()

    # Get latest security article
    article = source.get_latest_article("security")

    # Get multiple science articles
    articles = source.get_multiple_articles("science", count=3)

    # Check supported categories
    if "ai" in source.supported_categories:
        ai_articles = source.get_multiple_articles("ai", count=5)
"""

import re
from typing import List, Optional

import feedparser
from bs4 import BeautifulSoup

from the_data_packet.core.exceptions import NetworkError, ScrapingError
from the_data_packet.core.logging import get_logger
from the_data_packet.sources.base import Article, ArticleSource
from the_data_packet.utils.http import HTTPClient

logger = get_logger(__name__)



class WiredSource(ArticleSource):
    """Article source for Wired.com.

    Article URLs are discovered by parsing the per-category RSS feeds listed
    in ``RSS_FEEDS`` with ``feedparser``; each article page is then fetched
    through ``HTTPClient.get_soup`` and its title, author and body text are
    pulled out with CSS selectors.
    """


    def __init__(self) -> None:
        """Set up the HTTP client that article pages are fetched through."""
        self.http_client = HTTPClient()
        logger.info("Initialized Wired source")


    # RSS feed URL per category. The keys are also the category names
    # reported by ``supported_categories``.
    RSS_FEEDS = {
        "security": "https://www.wired.com/feed/category/security/latest/rss",
        "science": "https://www.wired.com/feed/category/science/latest/rss",
        "ai": "https://www.wired.com/feed/tag/ai/latest/rss",
    }

    # Lower-case substrings that mark boilerplate. ``_extract_content``
    # lower-cases each text block and drops it when any of these occurs
    # anywhere inside it.
    SKIP_PATTERNS = [
        "subscribe to wired",
        "most popular",
        "related stories",
        "advertisement",
        "get wired",
        "sign up",
        "newsletter",
    ]

    @property
    def name(self) -> str:
        """Source name identifier."""
        return "wired"

    @property
    def supported_categories(self) -> List[str]:
        """List of supported categories."""
        return list(self.RSS_FEEDS.keys())


    def get_latest_article(self, category: str) -> Article:
        """Return the most recent article published in ``category``.

        The first URL of the category's RSS feed is scraped and the result is
        checked with ``Article.is_valid()`` before it is returned.

        Args:
            category: Category name; passed to ``validate_category`` before
                any request is made.

        Returns:
            The scraped article.

        Raises:
            ScrapingError: If the feed yields no URL, the page cannot be
                scraped, the article is not valid, or an unexpected error
                occurs (the original error is attached as ``__cause__``).
            NetworkError: Re-raised unchanged from the feed or page fetch.
        """
        self.validate_category(category)

        logger.info(f"Fetching latest {category} article from Wired")

        try:
            url = self._get_latest_url_from_rss(category)
            article = self._extract_article(url, category)

            if not article.is_valid():
                raise ScrapingError("Extracted article is not valid: missing content")

            logger.info(f"Successfully extracted article: {article.title}")
            return article

        except (ScrapingError, NetworkError):
            # Already domain errors: let callers see them as-is.
            raise
        except Exception as e:
            raise ScrapingError(
                f"Failed to get latest article from {category}: {e}"
            ) from e



    def get_multiple_articles(self, category: str, count: int) -> List[Article]:
        """Return up to ``count`` valid articles from ``category``.

        URLs are taken from the category's RSS feed in feed order. A URL whose
        page cannot be scraped, or whose article fails ``Article.is_valid()``,
        is logged and skipped, so fewer than ``count`` articles may come back.

        Args:
            category: Category name; passed to ``validate_category`` before
                any request is made.
            count: Maximum number of feed URLs to scrape.

        Returns:
            The valid articles, in feed order. Never empty.

        Raises:
            ScrapingError: If no valid article could be produced, or for any
                other failure while reading the feed (the original error is
                attached as ``__cause__``).
        """
        self.validate_category(category)

        logger.info(f"Fetching {count} {category} articles from Wired")

        try:
            urls = self._get_urls_from_rss(category, count)

            articles: List[Article] = []
            for url in urls:
                # Best effort per URL: one bad page must not sink the batch.
                try:
                    article = self._extract_article(url, category)
                except Exception as e:
                    logger.warning(f"Failed to extract article {url}: {e}")
                else:
                    if article.is_valid():
                        articles.append(article)
                    else:
                        logger.warning(f"Skipping invalid article: {url}")

            if not articles:
                raise ScrapingError(f"No valid articles found in {category}")

            logger.info(f"Successfully extracted {len(articles)} articles")
            return articles

        except ScrapingError:
            raise
        except Exception as e:
            raise ScrapingError(f"Failed to get articles from {category}: {e}") from e


    def _get_latest_url_from_rss(self, category: str) -> str:
        """Get the latest article URL from RSS feed."""
        urls = self._get_urls_from_rss(category, 1)
        if not urls:
            raise ScrapingError(f"No articles found in RSS feed for {category}")
        return urls[0]

    def _get_urls_from_rss(self, category: str, count: int) -> List[str]:
        """Return up to ``count`` article URLs from the category's RSS feed.

        Entries are visited in feed order. Entries without a non-empty
        ``link`` are skipped and do not count towards ``count``, so a
        link-less entry near the top of the feed no longer shortens the
        result.

        Args:
            category: Key into ``RSS_FEEDS``.
            count: Maximum number of URLs to return. A value below 1 yields
                no URLs and therefore raises ``ScrapingError``.

        Returns:
            Between 1 and ``count`` URLs.

        Raises:
            ScrapingError: If the feed has no entries or no entry has a link.
            NetworkError: If anything else goes wrong while reading the feed
                (the original error is attached as ``__cause__``).
        """
        rss_url = self.RSS_FEEDS[category]

        logger.debug(f"Fetching RSS feed: {rss_url}")

        try:
            feed = feedparser.parse(rss_url)

            if not feed.entries:
                raise ScrapingError(f"No entries found in RSS feed for {category}")

            urls: List[str] = []
            for entry in feed.entries:
                # Checked first so count <= 0 collects nothing instead of
                # slicing from the end of the feed.
                if len(urls) >= count:
                    break
                link = getattr(entry, "link", None)
                if link:
                    urls.append(link)

            if not urls:
                raise ScrapingError(f"No valid URLs found in RSS feed for {category}")

            logger.debug(f"Found {len(urls)} article URLs")
            return urls

        except ScrapingError:
            raise
        except Exception as e:
            raise NetworkError(f"Failed to fetch RSS feed {rss_url}: {e}") from e

    def _extract_article(self, url: str, category: str) -> Article:
        """Fetch ``url`` and build an ``Article`` from the page.

        Args:
            url: Address of the article page.
            category: Category recorded on the resulting article.

        Returns:
            An ``Article`` carrying the extracted title, content and author
            (author may be ``None``), with ``source`` set to ``self.name``.

        Raises:
            ScrapingError: If title or content cannot be extracted, or an
                unexpected error occurs (the original error is attached as
                ``__cause__``).
            NetworkError: Re-raised unchanged from the page fetch.
        """
        logger.debug(f"Extracting article: {url}")

        try:
            soup = self._fetch_page(url)

            return Article(
                title=self._extract_title(soup),
                content=self._extract_content(soup),
                author=self._extract_author(soup),
                url=url,
                category=category,
                source=self.name,
            )

        except (NetworkError, ScrapingError):
            # Already domain errors: let callers see them as-is.
            raise
        except Exception as e:
            raise ScrapingError(f"Failed to extract article from {url}: {e}") from e

    def _fetch_page(self, url: str) -> BeautifulSoup:
        """Fetch and parse a web page."""
        soup = self.http_client.get_soup(url)
        return soup

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract article title."""
        # Try multiple selectors for title
        selectors = [
            "h1[data-testid='ContentHeaderHed']",
            "h1.ContentHeaderHed",
            "h1.entry-title",
            "h1",
            "title",
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element and element.get_text(strip=True):
                title = element.get_text(strip=True)
                # Clean title
                title = re.sub(r"\s+", " ", title)
                if title and len(title) > 5:  # Basic validation
                    return title

        raise ScrapingError("Could not extract article title")

    def _extract_author(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract article author."""
        # Try multiple selectors for author
        selectors = [
            "[data-testid='ContentHeaderAccreditation'] a",
            ".ContentHeaderAccreditation a",
            ".byline a",
            ".author a",
            "[rel='author']",
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element and element.get_text(strip=True):
                author = element.get_text(strip=True)
                author = re.sub(r"\s+", " ", author)
                if author:
                    return author

        return None

    def _extract_content(self, soup: BeautifulSoup) -> str:
        """Extract article content."""
        # Try multiple selectors for content
        selectors = [
            "[data-testid='ArticleBodyWrapper']",
            ".ArticleBodyWrapper",
            ".content-body",
            ".entry-content",
            "article",
        ]

        content_element = None
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                content_element = element
                break

        if not content_element:
            raise ScrapingError("Could not find article content")

        # Extract text and clean it
        paragraphs = []
        for p in content_element.find_all(["p", "div"], recursive=True):
            text = p.get_text(strip=True)
            if text and len(text) > 20:  # Filter out short snippets
                # Skip unwanted content
                text_lower = text.lower()
                if any(pattern in text_lower for pattern in self.SKIP_PATTERNS):
                    continue
                paragraphs.append(text)

        if not paragraphs:
            raise ScrapingError("No content paragraphs found")

        content = "\n\n".join(paragraphs)

        # Final cleaning
        content = re.sub(r"\s+", " ", content)
        content = content.strip()

        if len(content) < 100:
            raise ScrapingError("Article content too short")

        return content