# Source code for the_data_packet.sources.base

"""Base classes for article sources.

This module defines the core data structures and interfaces for collecting
articles from various news sources. It provides a standardized way to
represent articles and implement source-specific collection logic.

The design supports:
- Multiple news sources with different scraping strategies
- Consistent article representation across sources
- Validation of article content quality
- Category-based article filtering
- Extensible source implementation

Architecture:
    Article: Data class representing a single news article
    ArticleSource: Abstract base class for implementing news sources

Current Sources:
    - WiredSource: Wired.com articles via RSS feeds
    - TechCrunchSource: TechCrunch.com articles via RSS feeds

Future Sources (extensible):
    - ArsTechnicaSource
    - HackerNewsSource
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class Article:
    """Represents a single news article from any source.

    This data class provides a standardized representation of news articles
    regardless of their source. It includes validation methods to ensure
    articles have sufficient content for podcast generation.

    Attributes:
        title: Article headline/title. Required for all articles.
        content: Full article text content. Required and must be substantial.
        author: Article author name. Optional but recommended.
        url: Original article URL. Optional but useful for debugging.
        category: Article category (e.g., 'security', 'guide'). Optional.
        source: Source identifier (e.g., 'wired'). Optional but recommended.

    Content Requirements (checked by ``is_valid``):
        - Title must be non-empty
        - Content must be at least ``MIN_CONTENT_LENGTH`` (100) characters
          after stripping leading/trailing whitespace

    Content should also be clean text without HTML tags or navigation
    elements; that is a convention for producers and is not checked here.

    Example:
        article = Article(
            title="New Security Vulnerability Discovered",
            content="A critical security flaw has been found...",
            author="Jane Smith",
            url="https://example.com/article",
            category="security",
            source="wired"
        )

        if article.is_valid():
            # Process article for podcast generation
            pass
    """

    # Minimum stripped content length accepted by is_valid(). Deliberately
    # left un-annotated: dataclasses only turn *annotated* names into fields,
    # so this stays a plain class attribute and does not affect __init__,
    # __repr__, __eq__ or to_dict().
    MIN_CONTENT_LENGTH = 100

    title: str
    content: str
    author: Optional[str] = None
    url: Optional[str] = None
    category: Optional[str] = None
    source: Optional[str] = None

    def is_valid(self) -> bool:
        """Check if article has sufficient content for podcast generation.

        Validates that the article has:
        - Non-empty title (the title itself is not stripped)
        - Non-empty content
        - Content length of at least ``MIN_CONTENT_LENGTH`` characters
          (after stripping leading/trailing whitespace)

        Returns:
            True if article meets minimum content requirements

        Example:
            if not article.is_valid():
                logger.warning(f"Skipping invalid article: {article.title}")
                continue
        """
        # Falsy title/content (empty string or None) can never be valid.
        if not self.title or not self.content:
            return False

        # ">=" so that content of exactly the minimum length is accepted,
        # matching the documented "at least 100 characters" contract
        # (the previous strict ">" rejected exactly-100-character content).
        return len(self.content.strip()) >= self.MIN_CONTENT_LENGTH

    def to_dict(self) -> Dict[str, Optional[str]]:
        """Convert article to dictionary representation.

        Returns:
            Dictionary with all article fields, keyed by field name

        Example:
            article_data = article.to_dict()
            json.dump(article_data, file)
        """
        return {
            "title": self.title,
            "content": self.content,
            "author": self.author,
            "url": self.url,
            "category": self.category,
            "source": self.source,
        }
class ArticleSource(ABC):
    """Interface that every news article source has to implement.

    Concrete subclasses hide the source-specific details (feeds, scraping,
    parsing) behind one common API so callers can treat all sources alike.

    A source implementation is expected to:
        - Declare which categories it supports
        - Implement the RSS feed or web scraping logic
        - Deal with rate limiting and error recovery
        - Clean and validate article content
        - Hand back standardized Article objects

    Members a subclass must provide:
        name: Property returning the source identifier
        supported_categories: Property returning the list of valid categories
        get_latest_article(): Fetch the single latest article
        get_multiple_articles(): Fetch several articles

    Example Implementation:
        class ExampleSource(ArticleSource):
            @property
            def name(self) -> str:
                return "example"

            @property
            def supported_categories(self) -> List[str]:
                return ["tech", "science"]

            def get_latest_article(self, category: str) -> Article:
                # Implementation specific logic
                pass

    Usage:
        source = WiredSource()
        if "security" in source.supported_categories:
            article = source.get_latest_article("security")
            articles = source.get_multiple_articles("security", count=5)
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Unique identifier of this source.

        The identifier is meant for configuration, logging and file naming,
        and is embedded in the error raised by ``validate_category``.

        Returns:
            Source identifier (e.g., "wired", "techcrunch")
        """
        ...

    @property
    @abstractmethod
    def supported_categories(self) -> List[str]:
        """Categories this source can collect articles from.

        Categories should mirror the source's RSS feeds or section structure.
        ``validate_category`` checks membership against this list.

        Returns:
            List of category strings (e.g., ["security", "guide", "business"])
        """
        ...

    @abstractmethod
    def get_latest_article(self, category: str) -> Article:
        """Fetch the most recent article of one category.

        Args:
            category: Category to fetch from (must be in supported_categories)

        Returns:
            The latest Article instance from that category

        Raises:
            ScrapingError: If article collection fails
            ValidationError: If category is not supported
            NetworkError: If network request fails

        Example:
            try:
                article = source.get_latest_article("security")
                logger.info(f"Retrieved: {article.title}")
            except ValidationError:
                logger.error(f"Category 'invalid' not supported")
        """
        ...

    @abstractmethod
    def get_multiple_articles(self, category: str, count: int) -> List[Article]:
        """Fetch up to ``count`` articles of one category.

        Args:
            category: Category to fetch from (must be in supported_categories)
            count: Maximum number of articles to return

        Returns:
            List of Article instances (may be fewer than count if unavailable)

        Raises:
            ScrapingError: If article collection fails
            ValidationError: If category is not supported or count is invalid
            NetworkError: If network request fails

        Example:
            articles = source.get_multiple_articles("guide", count=3)
            valid_articles = [a for a in articles if a.is_valid()]
        """
        ...

    def validate_category(self, category: str) -> None:
        """Ensure ``category`` is one of this source's supported categories.

        Returns silently when the category is supported.

        Args:
            category: Category to validate

        Raises:
            ValidationError: If category is not in ``supported_categories``

        Example:
            source.validate_category("security")  # OK
            source.validate_category("invalid")   # Raises ValidationError
        """
        # Imported at call time rather than at module import time
        # (presumably to avoid a circular import -- confirm before moving).
        from ..core.exceptions import ValidationError

        # Guard clause: nothing to do for a supported category.
        if category in self.supported_categories:
            return

        raise ValidationError(
            f"Category '{category}' not supported by {self.name}. "
            f"Supported categories: {', '.join(self.supported_categories)}"
        )