# Source code for the_data_packet.sources.wired

"""Wired.com article source implementation.

This module implements article collection from Wired.com using RSS feeds and
web scraping. Wired.com provides RSS feeds for different categories that contain
recent article URLs, which are then scraped for full content.

Features:
    - RSS feed-based article discovery
    - Multiple category support (security, guides, business, science, AI)
    - Robust content extraction with fallback methods
    - Content cleaning and validation
    - Error handling for network issues and malformed content

RSS Feed Strategy:
    1. Fetch category-specific RSS feed
    2. Parse feed to extract article URLs
    3. Scrape individual articles for full content
    4. Clean and validate extracted content
    5. Return standardized Article objects

Content Extraction:
    - Primary: Article body containers and paragraph tags
    - Fallback: Main content areas and text containers
    - Cleaning: Remove navigation, ads, and boilerplate text
    - Validation: Ensure sufficient content length

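Supported Categories:
    - security: Security and cybersecurity articles
    - guide: How-to guides and tutorials (no feed currently configured in RSS_FEEDS)
    - business: Business and industry news (no feed currently configured in RSS_FEEDS)
    - science: Science and technology research
    - ai: Artificial intelligence and machine learning

    Note: only categories that have an entry in ``WiredSource.RSS_FEEDS`` are
    reported by ``supported_categories``; check that property at runtime.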

Rate Limiting:
    - Respectful delays between requests
    - Connection reuse via HTTP session
    - Proper User-Agent identification

Example Usage:
    source = WiredSource()

    # Get latest security article
    article = source.get_latest_article("security")

    # Get multiple science articles
    articles = source.get_multiple_articles("science", count=3)

    # Check supported categories
    if "ai" in source.supported_categories:
        ai_articles = source.get_multiple_articles("ai", count=5)
"""

import re
from typing import List, Optional

import feedparser
from bs4 import BeautifulSoup

from the_data_packet.core.exceptions import NetworkError, ScrapingError
from the_data_packet.core.logging import get_logger
from the_data_packet.sources.base import Article, ArticleSource
from the_data_packet.utils.http import HTTPClient

logger = get_logger(__name__)


class WiredSource(ArticleSource):
    """Article source for Wired.com.

    Articles are discovered through the per-category RSS feeds listed in
    ``RSS_FEEDS`` and each article page is then fetched and scraped for its
    title, author and body text.
    """

    def __init__(self) -> None:
        """Initialize Wired source."""
        self.http_client = HTTPClient()
        logger.info("Initialized Wired source")

    # RSS feed URLs for different categories
    RSS_FEEDS = {
        "security": "https://www.wired.com/feed/category/security/latest/rss",
        "science": "https://www.wired.com/feed/category/science/latest/rss",
        "ai": "https://www.wired.com/feed/tag/ai/latest/rss",
    }

    # Content patterns to skip during extraction (matched case-insensitively
    # as substrings of each extracted text block)
    SKIP_PATTERNS = [
        "subscribe to wired",
        "most popular",
        "related stories",
        "advertisement",
        "get wired",
        "sign up",
        "newsletter",
    ]

    @property
    def name(self) -> str:
        """Source name identifier."""
        return "wired"

    @property
    def supported_categories(self) -> List[str]:
        """List of supported categories (the keys of ``RSS_FEEDS``)."""
        return list(self.RSS_FEEDS.keys())

    def get_latest_article(self, category: str) -> Article:
        """Get the latest article from a category.

        Args:
            category: Category key; checked with the inherited
                ``validate_category`` before any network access.

        Returns:
            The newest article listed in the category's RSS feed.

        Raises:
            ScrapingError: If no article could be found or extracted, or the
                extracted article fails ``is_valid()``.
            NetworkError: Propagated unchanged from the RSS/page fetch helpers.
        """
        self.validate_category(category)
        logger.info(f"Fetching latest {category} article from Wired")

        try:
            # Get latest article URL from RSS
            url = self._get_latest_url_from_rss(category)

            # Fetch and extract article content
            article = self._extract_article(url, category)

            if not article.is_valid():
                raise ScrapingError("Extracted article is not valid: missing content")

            logger.info(f"Successfully extracted article: {article.title}")
            return article
        except (ScrapingError, NetworkError):
            # Already domain errors: let them through untouched.
            raise
        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise ScrapingError(
                f"Failed to get latest article from {category}: {e}"
            ) from e

    def get_multiple_articles(self, category: str, count: int) -> List[Article]:
        """Get multiple articles from a category.

        Individual articles that fail to extract or validate are logged and
        skipped (best effort); the call only fails when nothing usable remains.

        Args:
            category: Category key; checked with the inherited
                ``validate_category`` before any network access.
            count: Maximum number of feed URLs to try.

        Returns:
            The valid articles, in feed order (may be fewer than ``count``).

        Raises:
            ScrapingError: If the feed could not be read or no valid article
                was extracted. Any other failure (including ``NetworkError``
                from the feed fetch) is wrapped in ``ScrapingError``.
        """
        self.validate_category(category)
        logger.info(f"Fetching {count} {category} articles from Wired")

        try:
            # Get multiple URLs from RSS
            urls = self._get_urls_from_rss(category, count)

            articles: List[Article] = []
            for url in urls:
                try:
                    article = self._extract_article(url, category)
                except Exception as e:
                    # Deliberate best effort: one bad page must not abort the batch.
                    logger.warning(f"Failed to extract article {url}: {e}")
                    continue

                if article.is_valid():
                    articles.append(article)
                else:
                    logger.warning(f"Skipping invalid article: {url}")

            if not articles:
                raise ScrapingError(f"No valid articles found in {category}")

            logger.info(f"Successfully extracted {len(articles)} articles")
            return articles
        except ScrapingError:
            raise
        except Exception as e:
            raise ScrapingError(f"Failed to get articles from {category}: {e}") from e

    def _get_latest_url_from_rss(self, category: str) -> str:
        """Get the latest article URL from RSS feed.

        Raises:
            ScrapingError: If the feed yields no URL.
            NetworkError: If reading the feed fails unexpectedly.
        """
        urls = self._get_urls_from_rss(category, 1)
        if not urls:
            raise ScrapingError(f"No articles found in RSS feed for {category}")
        return urls[0]

    def _get_urls_from_rss(self, category: str, count: int) -> List[str]:
        """Get article URLs from RSS feed.

        Args:
            category: Key into ``RSS_FEEDS``.
            count: Maximum number of URLs to return.

        Returns:
            Up to ``count`` article links, in feed order.

        Raises:
            ScrapingError: If the feed has no entries or no entry has a link.
            NetworkError: If parsing the feed raises any other exception.
        """
        rss_url = self.RSS_FEEDS[category]
        logger.debug(f"Fetching RSS feed: {rss_url}")

        try:
            # Parse RSS feed
            feed = feedparser.parse(rss_url)

            # feedparser flags fetch/parse problems via ``bozo`` instead of
            # raising, so surface them in the log for diagnosis.
            if getattr(feed, "bozo", False):
                logger.warning(
                    f"RSS feed {rss_url} reported a problem: "
                    f"{getattr(feed, 'bozo_exception', None)}"
                )

            if not feed.entries:
                raise ScrapingError(f"No entries found in RSS feed for {category}")

            # Filter for entries that actually carry a link *before* slicing,
            # so link-less entries do not eat into the requested count.
            links = [
                entry.link for entry in feed.entries if getattr(entry, "link", None)
            ]
            urls = links[:count]

            if not urls:
                raise ScrapingError(f"No valid URLs found in RSS feed for {category}")

            logger.debug(f"Found {len(urls)} article URLs")
            return urls
        except ScrapingError:
            raise
        except Exception as e:
            raise NetworkError(f"Failed to fetch RSS feed {rss_url}: {e}") from e

    def _extract_article(self, url: str, category: str) -> Article:
        """Extract article content from a Wired article page.

        Args:
            url: Article page URL.
            category: Category recorded on the resulting article.

        Returns:
            An ``Article`` populated with title, content, author, url,
            category and this source's name.

        Raises:
            NetworkError, ScrapingError: Propagated unchanged.
            ScrapingError: Wrapping any other failure.
        """
        logger.debug(f"Extracting article: {url}")

        try:
            # Fetch article page
            soup = self._fetch_page(url)

            # Extract article data
            title = self._extract_title(soup)
            author = self._extract_author(soup)
            content = self._extract_content(soup)

            return Article(
                title=title,
                content=content,
                author=author,
                url=url,
                category=category,
                source=self.name,
            )
        except (NetworkError, ScrapingError):
            raise
        except Exception as e:
            raise ScrapingError(f"Failed to extract article from {url}: {e}") from e

    def _fetch_page(self, url: str) -> BeautifulSoup:
        """Fetch and parse a web page via the shared HTTP client."""
        return self.http_client.get_soup(url)

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        """Collapse every run of whitespace to one space and trim the ends."""
        return re.sub(r"\s+", " ", text).strip()

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Extract article title.

        Selectors are tried from most to least specific; the first one whose
        text is longer than 5 characters wins.

        Raises:
            ScrapingError: If no selector yields a usable title.
        """
        # Try multiple selectors for title
        selectors = [
            "h1[data-testid='ContentHeaderHed']",
            "h1.ContentHeaderHed",
            "h1.entry-title",
            "h1",
            "title",
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            # Use plain get_text() and normalize afterwards: get_text(strip=True)
            # joins fragments with "" and glues words across inline tags.
            title = self._normalize_whitespace(element.get_text())
            if len(title) > 5:  # Basic validation
                return title

        raise ScrapingError("Could not extract article title")

    def _extract_author(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract article author.

        Returns:
            The first non-empty author text found, or ``None`` if no selector
            matches.
        """
        # Try multiple selectors for author
        selectors = [
            "[data-testid='ContentHeaderAccreditation'] a",
            ".ContentHeaderAccreditation a",
            ".byline a",
            ".author a",
            "[rel='author']",
        ]

        for selector in selectors:
            element = soup.select_one(selector)
            if element is None:
                continue
            author = self._normalize_whitespace(element.get_text())
            if author:
                return author

        return None

    def _extract_content(self, soup: BeautifulSoup) -> str:
        """Extract article content.

        Finds the first matching body container, collects its leaf-level
        ``<p>``/``<div>`` text blocks, drops short or boilerplate blocks, and
        joins the rest with blank lines.

        Returns:
            The article body, one paragraph per block separated by ``"\\n\\n"``.

        Raises:
            ScrapingError: If no container is found, no paragraph survives
                filtering, or the result is shorter than 100 characters.
        """
        # Try multiple selectors for content
        selectors = [
            "[data-testid='ArticleBodyWrapper']",
            ".ArticleBodyWrapper",
            ".content-body",
            ".entry-content",
            "article",
        ]

        content_element = None
        for selector in selectors:
            content_element = soup.select_one(selector)
            if content_element is not None:
                break

        if content_element is None:
            raise ScrapingError("Could not find article content")

        # Extract text and clean it
        paragraphs: List[str] = []
        for block in content_element.find_all(["p", "div"]):
            # Only take leaf-level blocks. A wrapper's text already includes
            # the text of every nested <p>/<div>, so collecting wrappers too
            # would duplicate the article body. (Text placed directly inside a
            # wrapper next to nested blocks is intentionally not collected.)
            if block.find(["p", "div"]) is not None:
                continue

            text = self._normalize_whitespace(block.get_text())
            if len(text) <= 20:  # Filter out short snippets
                continue

            # Skip unwanted content
            text_lower = text.lower()
            if any(pattern in text_lower for pattern in self.SKIP_PATTERNS):
                continue

            paragraphs.append(text)

        if not paragraphs:
            raise ScrapingError("No content paragraphs found")

        # Whitespace is normalized per paragraph above, so the paragraph
        # separators inserted here survive in the final content.
        content = "\n\n".join(paragraphs)

        if len(content) < 100:
            raise ScrapingError("Article content too short")

        return content