# Source code for the_data_packet.sources.base

"""Base classes for article sources.

This module defines the core data structures and interfaces for collecting
articles from various news sources. It provides a standardized way to
represent articles and implement source-specific collection logic.

The design supports:
- Multiple news sources with different scraping strategies
- Consistent article representation across sources
- Validation of article content quality
- Category-based article filtering
- Extensible source implementation

Architecture:
    Article: Data class representing a single news article
    ArticleSource: Abstract base class for implementing news sources

Current Sources:
    - WiredSource: Wired.com articles via RSS feeds
    - TechCrunchSource: TechCrunch.com articles via RSS feeds

Future Sources (extensible):
    - ArsTechnicaSource
    - HackerNewsSource
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, List, Optional



# [docs] -- appears to be a documentation-link marker left over from the rendered source page
@dataclass
class Article:
    """Standardized representation of a single news article from any source.

    Every source produces instances of this data class, so downstream code
    can treat articles uniformly. ``is_valid()`` reports whether an article
    carries enough text to be worth turning into podcast material.

    Attributes:
        title: Article headline/title. Required for all articles.
        content: Full article text content. Required and must be substantial.
        author: Article author name. Optional but recommended.
        url: Original article URL. Optional but useful for debugging.
        category: Article category (e.g., 'security', 'guide'). Optional.
        source: Source identifier (e.g., 'wired'). Optional but recommended.

    Content Requirements:
        - Title must be non-empty
        - Content must be at least 100 characters after stripping whitespace
        - Content should be clean text without HTML tags or navigation
          elements (a recommendation for producers; ``is_valid()`` only
          checks the first two requirements)

    Example:
        article = Article(
            title="New Security Vulnerability Discovered",
            content="A critical security flaw has been found...",
            author="Jane Smith",
            url="https://example.com/article",
            category="security",
            source="wired"
        )

        if article.is_valid():
            # Process article for podcast generation
            pass
    """

    title: str
    content: str
    author: Optional[str] = None
    url: Optional[str] = None
    category: Optional[str] = None
    source: Optional[str] = None

    def is_valid(self) -> bool:
        """Check if article has sufficient content for podcast generation.

        Validates that the article has:
        - Non-empty title
        - Non-empty content
        - Content length of at least 100 characters (after stripping
          leading/trailing whitespace)

        Returns:
            True if the article meets the minimum content requirements,
            False otherwise.

        Example:
            if not article.is_valid():
                logger.warning(f"Skipping invalid article: {article.title}")
                continue
        """
        # Guard first: a missing/empty title or content can never be valid,
        # and this avoids calling .strip() on a falsy value such as None.
        if not self.title or not self.content:
            return False

        # ">=" (not ">") so that content of exactly 100 characters satisfies
        # the documented "at least 100 characters" requirement.
        return len(self.content.strip()) >= 100

    def to_dict(self) -> Dict[str, Optional[str]]:
        """Convert article to dictionary representation.

        Returns:
            Dictionary with one entry per article field, keyed by field
            name. Optional fields that were not set map to None.

        Example:
            article_data = article.to_dict()
            json.dump(article_data, file)
        """
        return {
            "title": self.title,
            "content": self.content,
            "author": self.author,
            "url": self.url,
            "category": self.category,
            "source": self.source,
        }





# [docs] -- appears to be a documentation-link marker left over from the rendered source page
class ArticleSource(ABC):
    """Abstract base class for implementing news article sources.

    This class defines the interface that all article sources must implement.
    It provides a consistent way to collect articles from different news
    websites while handling source-specific details in subclasses.

    Each source implementation should:
    - Define supported categories
    - Implement RSS feed or web scraping logic
    - Handle rate limiting and error recovery
    - Clean and validate article content
    - Return standardized Article objects

    Subclasses must implement:
        name: Property returning source identifier
        supported_categories: Property returning list of valid categories
        get_latest_article(): Method to get single latest article
        get_multiple_articles(): Method to get multiple articles

    Example Implementation:
        class ExampleSource(ArticleSource):
            @property
            def name(self) -> str:
                return "example"

            @property
            def supported_categories(self) -> List[str]:
                return ["tech", "science"]

            def get_latest_article(self, category: str) -> Article:
                # Implementation specific logic
                ...

            def get_multiple_articles(
                self, category: str, count: int
            ) -> List[Article]:
                # Implementation specific logic; all four abstract members
                # must be overridden before the class can be instantiated
                ...

    Usage:
        source = WiredSource()
        if "security" in source.supported_categories:
            article = source.get_latest_article("security")
            articles = source.get_multiple_articles("security", count=5)
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Source name identifier.

        Returns a unique string identifier for this source.
        Used in configuration, logging, and file naming.

        Returns:
            Source identifier (e.g., "wired", "techcrunch")
        """
        pass

    @property
    @abstractmethod
    def supported_categories(self) -> List[str]:
        """List of supported article categories for this source.

        Returns the categories this source can collect articles from.
        Categories should match the source's RSS feeds or section structure.

        Returns:
            List of category strings (e.g., ["security", "guide", "business"])
        """
        pass

    @abstractmethod
    def get_latest_article(self, category: str) -> Article:
        """Get the latest article from a specific category.

        Args:
            category: Category to fetch from (must be in supported_categories)

        Returns:
            Latest Article instance from the category

        Raises:
            ScrapingError: If article collection fails
            ValidationError: If category is not supported
            NetworkError: If network request fails

        Example:
            try:
                article = source.get_latest_article("security")
                logger.info(f"Retrieved: {article.title}")
            except ValidationError:
                logger.error(f"Category 'invalid' not supported")
        """
        pass

    @abstractmethod
    def get_multiple_articles(self, category: str, count: int) -> List[Article]:
        """Get multiple articles from a specific category.

        Args:
            category: Category to fetch from (must be in supported_categories)
            count: Maximum number of articles to return

        Returns:
            List of Article instances (may be fewer than count if unavailable)

        Raises:
            ScrapingError: If article collection fails
            ValidationError: If category is not supported or count is invalid
            NetworkError: If network request fails

        Example:
            articles = source.get_multiple_articles("guide", count=3)
            valid_articles = [a for a in articles if a.is_valid()]
        """
        pass

    def validate_category(self, category: str) -> None:
        """Validate if a category is supported by this source.

        Args:
            category: Category to validate

        Raises:
            ValidationError: If category is not in ``supported_categories``.
                The message names this source and lists the categories it
                does support.

        Example:
            source.validate_category("security")  # OK
            source.validate_category("invalid")   # Raises ValidationError
        """
        # Imported at call time rather than at module top; presumably to
        # avoid a circular import with the core package -- confirm before
        # moving it.
        from ..core.exceptions import ValidationError

        # Read the property once so the list that is checked is exactly the
        # list reported in the error message.
        supported = self.supported_categories
        if category not in supported:
            raise ValidationError(
                f"Category '{category}' not supported by {self.name}. "
                f"Supported categories: {', '.join(supported)}"
            )