# Source code for the_data_packet.sources.techcrunch

"""TechCrunch.com article source implementation.

This module implements article collection from TechCrunch.com using RSS feeds and
web scraping. TechCrunch provides RSS feeds for different categories that contain
recent article URLs, which are then scraped for full content.

Features:
    - RSS feed-based article discovery
    - Multiple category support (ai, security)
    - Robust content extraction with fallback methods
    - Content cleaning and validation
    - Error handling for network issues and malformed content

RSS Feed Strategy:
    1. Fetch category-specific RSS feed
    2. Parse feed to extract article URLs
    3. Scrape individual articles for full content
    4. Clean and validate extracted content
    5. Return standardized Article objects

Content Extraction:
    - Primary: Article body containers and paragraph tags
    - Fallback: Main content areas and text containers
    - Cleaning: Remove navigation, ads, and boilerplate text
    - Validation: Ensure sufficient content length

Supported Categories:
    - ai: Artificial intelligence and machine learning articles
    - security: Security and cybersecurity articles

Rate Limiting:
    - Respectful delays between requests
    - Connection reuse via HTTP session
    - Proper User-Agent identification

Example Usage:
    source = TechCrunchSource()

    # Get latest AI article
    article = source.get_latest_article("ai")

    # Get multiple security articles
    articles = source.get_multiple_articles("security", count=3)

    # Check supported categories
    if "ai" in source.supported_categories:
        ai_articles = source.get_multiple_articles("ai", count=5)
"""

import re
import time
from typing import List, Optional

import feedparser
from bs4 import BeautifulSoup

from the_data_packet.core.exceptions import NetworkError, ScrapingError
from the_data_packet.core.logging import get_logger
from the_data_packet.sources.base import Article, ArticleSource
from the_data_packet.utils.http import HTTPClient

logger = get_logger(__name__)



# [docs] (documentation back-link marker from the rendered page, not code)
class TechCrunchSource(ArticleSource):
    """Article source for TechCrunch.com."""


    # [docs] (documentation back-link marker from the rendered page, not code)
    def __init__(self) -> None:
        """Create the source and the HTTP client it uses for all requests.

        Stores a new ``HTTPClient`` on ``self.http_client``; the RSS and
        article helpers in this class issue every request through it. An
        initialization message is logged afterwards.
        """
        # NOTE(review): the base ``ArticleSource`` initializer is not invoked
        # here -- confirm the base class needs no setup of its own.
        self.http_client = HTTPClient()
        logger.info("Initialized TechCrunch source")


    # Maps each category key accepted by this source to the TechCrunch RSS
    # feed URL for that category. The keys are what ``supported_categories``
    # reports, and the RSS helpers index this dict directly by category.
    RSS_FEEDS = {
        "ai": "https://techcrunch.com/category/artificial-intelligence/feed/",
        "security": "https://techcrunch.com/category/security/feed/",
    }

    # Boilerplate markers used to drop non-article text during extraction.
    # ``_should_skip_content`` lowercases the candidate text and tests plain
    # substring containment, so every entry here must be lowercase to match.
    SKIP_PATTERNS = [
        "subscribe to techcrunch",
        "most popular",
        "related articles",
        "advertisement",
        "get techcrunch",
        "sign up",
        "newsletter",
        "techcrunch+",
        "techcrunch disrupt",
        "more techcrunch",
        "follow us",
        "share this article",
    ]

    @property
    def name(self) -> str:
        """Return the fixed identifier of this source (``"techcrunch"``).

        ``_extract_article`` passes this value as the ``source`` field of
        every ``Article`` it builds.
        """
        return "techcrunch"

    @property
    def supported_categories(self) -> List[str]:
        """List of supported categories."""
        return list(self.RSS_FEEDS.keys())


    # [docs] (documentation back-link marker from the rendered page, not code)
    def get_latest_article(self, category: str) -> Article:
        """Fetch, extract, and validate the newest article of a category.

        The URL comes from the first entry of the category's RSS feed; the
        page behind it is then scraped into an ``Article``.

        Args:
            category: Category key; checked with ``validate_category`` before
                any network access.

        Returns:
            The extracted article, guaranteed to pass ``Article.is_valid()``.

        Raises:
            ScrapingError: If extraction fails, the article is not valid, or
                an unexpected error occurs (the original error is chained as
                ``__cause__``).
            NetworkError: Propagated unchanged from the fetch helpers.
        """
        self.validate_category(category)

        logger.info(f"Fetching latest {category} article from TechCrunch")

        try:
            # Get latest article URL from RSS
            url = self._get_latest_url_from_rss(category)

            # Fetch and extract article content
            article = self._extract_article(url, category)

            if not article.is_valid():
                raise ScrapingError("Extracted article is not valid: missing content")

            logger.info(f"Successfully extracted article: {article.title}")
            return article

        except (ScrapingError, NetworkError):
            # Domain errors already carry a precise message; pass them through.
            raise
        except Exception as e:
            # Anything else is unexpected: wrap it, keeping the cause attached
            # so the original traceback is not lost.
            raise ScrapingError(
                f"Failed to get latest article from {category}: {e}"
            ) from e



    # [docs] (documentation back-link marker from the rendered page, not code)
    def get_multiple_articles(self, category: str, count: int) -> List[Article]:
        """Fetch up to ``count`` valid articles from a category.

        URLs are read from the category's RSS feed and scraped one by one.
        Extraction is best-effort per article: a failure or an invalid article
        is logged as a warning and skipped, so fewer than ``count`` articles
        may be returned.

        Args:
            category: Category key; checked with ``validate_category``.
            count: Maximum number of articles to return; must be at least 1.

        Returns:
            A non-empty list of articles that passed ``Article.is_valid()``.

        Raises:
            ScrapingError: If ``count`` is below 1, no valid article could be
                extracted, or an unexpected error occurs (chained as
                ``__cause__``).
            NetworkError: Propagated unchanged when the RSS feed cannot be
                fetched.
        """
        self.validate_category(category)

        if count < 1:
            raise ScrapingError("Count must be at least 1")

        logger.info(f"Fetching {count} {category} articles from TechCrunch")

        try:
            # Get multiple URLs from RSS
            urls = self._get_urls_from_rss(category, count)

            articles = []
            for i, url in enumerate(urls):
                try:
                    # Add delay between requests to be respectful
                    if i > 0:
                        time.sleep(1)

                    article = self._extract_article(url, category)
                    if article.is_valid():
                        articles.append(article)
                    else:
                        logger.warning(f"Skipping invalid article: {url}")
                except Exception as e:
                    # Deliberately broad: one bad article must not abort the
                    # whole batch.
                    logger.warning(f"Failed to extract article {url}: {e}")
                    continue

            if not articles:
                raise ScrapingError(f"No valid articles found in {category}")

            logger.info(f"Successfully extracted {len(articles)} articles")
            return articles

        except (ScrapingError, NetworkError):
            # Domain errors already carry a precise message; pass them through.
            raise
        except Exception as e:
            # Wrap anything unexpected while keeping the original cause.
            raise ScrapingError(
                f"Failed to get articles from {category}: {e}"
            ) from e


    def _get_latest_url_from_rss(self, category: str) -> str:
        """Return the link of the first entry in the category's RSS feed.

        Args:
            category: Key into ``RSS_FEEDS``; callers validate it beforehand.

        Returns:
            The entry's link as a string.

        Raises:
            ScrapingError: If the feed has no entries, or the first entry has
                no link (missing or empty).
            NetworkError: If fetching or parsing the feed fails for any other
                reason (the original error is chained as ``__cause__``).
        """
        rss_url = self.RSS_FEEDS[category]

        try:
            logger.debug(f"Fetching RSS feed: {rss_url}")
            response = self.http_client.get(rss_url)
            response.raise_for_status()

            # Parse RSS feed
            feed = feedparser.parse(response.text)

            if not feed.entries:
                raise ScrapingError(f"No articles found in RSS feed: {rss_url}")

            # Get latest article URL; an empty link is as unusable as a
            # missing one, so both are rejected here.
            latest_entry = feed.entries[0]
            link = getattr(latest_entry, "link", None)
            if not link:
                raise ScrapingError(f"RSS entry missing link: {latest_entry}")

            return str(link)

        except ScrapingError:
            raise
        except Exception as e:
            raise NetworkError(f"Failed to fetch RSS feed {rss_url}: {e}") from e

    def _get_urls_from_rss(self, category: str, count: int) -> List[str]:
        """Return up to ``count`` article links from the category's RSS feed.

        Entries without a usable link are skipped *before* the count is
        applied, so they do not reduce the number of URLs returned while
        later entries could still fill the request.

        Args:
            category: Key into ``RSS_FEEDS``; callers validate it beforehand.
            count: Maximum number of URLs to return. Values below 1 yield no
                URLs and therefore raise ``ScrapingError``.

        Returns:
            Links in feed order, each coerced to ``str``.

        Raises:
            ScrapingError: If the feed has no entries or no usable links.
            NetworkError: If fetching or parsing the feed fails for any other
                reason (the original error is chained as ``__cause__``).
        """
        rss_url = self.RSS_FEEDS[category]

        try:
            logger.debug(f"Fetching RSS feed: {rss_url}")
            response = self.http_client.get(rss_url)
            response.raise_for_status()

            # Parse RSS feed
            feed = feedparser.parse(response.text)

            if not feed.entries:
                raise ScrapingError(f"No articles found in RSS feed: {rss_url}")

            # Filter first, then truncate: link-less entries must not use up
            # slots of the requested count.
            links = [
                str(entry.link)
                for entry in feed.entries
                if getattr(entry, "link", None)
            ]
            urls = links[: max(count, 0)]

            if not urls:
                raise ScrapingError(f"No valid URLs found in RSS feed: {rss_url}")

            return urls

        except ScrapingError:
            raise
        except Exception as e:
            raise NetworkError(f"Failed to fetch RSS feed {rss_url}: {e}") from e

    def _extract_article(self, url: str, category: str) -> Article:
        """Download a TechCrunch page and build an ``Article`` from it.

        Args:
            url: Address of the article page.
            category: Category key recorded on the resulting article.

        Returns:
            An ``Article`` with title, cleaned content, optional author, the
            given ``url`` and ``category``, and ``source`` set to ``self.name``.

        Raises:
            ScrapingError: If no title is found, no content remains after
                cleaning, or an unexpected error occurs (chained as
                ``__cause__``).
            NetworkError: Propagated unchanged if raised while fetching.
        """
        try:
            logger.debug(f"Extracting article: {url}")
            response = self.http_client.get(url)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")

            # Extract title
            title = self._extract_title(soup)
            if not title:
                raise ScrapingError(f"No title found for article: {url}")

            # Extract author (optional; may be None)
            author = self._extract_author(soup)

            # Extract and clean content. The emptiness check runs after
            # cleaning because cleaning can remove everything that was found.
            raw_content = self._extract_content(soup)
            content = self._clean_content(raw_content) if raw_content else ""
            if not content:
                raise ScrapingError(f"No content found for article: {url}")

            return Article(
                title=title,
                content=content,
                author=author,
                url=url,
                category=category,
                source=self.name,
            )

        except (ScrapingError, NetworkError):
            raise
        except Exception as e:
            raise ScrapingError(f"Failed to extract article {url}: {e}") from e

    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Find the article headline in the parsed page.

        CSS selectors are tried in priority order and the first one that
        yields non-empty text wins. If none does, the page ``<title>`` is
        used with a trailing ``| TechCrunch...`` suffix removed.

        Args:
            soup: Parsed HTML of the article page.

        Returns:
            The headline text, or ``None`` when the page has neither a
            matching element nor a ``<title>`` tag.
        """
        candidate_selectors = (
            'h1[data-module="ArticleTitle"]',
            "h1.article__title",
            "h1",
            '[data-module="ArticleTitle"]',
            ".post-title",
            ".entry-title",
        )

        for css in candidate_selectors:
            node = soup.select_one(css)
            if not node:
                continue
            headline = node.get_text().strip()
            if headline:
                return headline

        # Last resort: the document title, minus the site suffix.
        page_title = soup.find("title")
        if not page_title:
            return None

        raw_title = page_title.get_text().strip()
        return re.sub(r"\s*\|\s*TechCrunch.*$", "", raw_title)

    def _extract_author(self, soup: BeautifulSoup) -> Optional[str]:
        """Find the article's author name in the parsed page.

        Byline selectors are tried in priority order; the first element with
        non-empty text provides the name.

        Args:
            soup: Parsed HTML of the article page.

        Returns:
            The author's name, or ``None`` if no selector produced any text.
        """
        byline_selectors = (
            '[data-module="ArticleByline"] a',
            ".byline a",
            ".author a",
            ".post-author a",
            '[rel="author"]',
            ".article-author a",
        )

        for css in byline_selectors:
            node = soup.select_one(css)
            if not node:
                continue
            name_text = node.get_text().strip()
            if name_text:
                return name_text

        return None

    def _extract_content(self, soup: BeautifulSoup) -> Optional[str]:
        """Pull the article body text out of the parsed page.

        Known body containers are tried in priority order. Inside a matched
        container, script/style/ad elements are removed from the tree before
        its text is read; a container is accepted only if it yields more than
        100 characters. If no container qualifies, all ``<p>`` elements on the
        page are joined, leaving out paragraphs rejected by
        ``_should_skip_content``.

        Args:
            soup: Parsed HTML of the article page. Matched containers are
                modified in place (unwanted elements are decomposed).

        Returns:
            The body text, or ``None`` when nothing longer than 100
            characters could be assembled.
        """
        container_selectors = (
            '[data-module="ArticleBody"]',
            ".article-content",
            ".post-content",
            ".entry-content",
            ".article__content",
            "div.article-entry",
        )
        noise_selector = "script, style, .advertisement, .ad, .promo"

        for css in container_selectors:
            container = soup.select_one(css)
            if not container:
                continue

            # Strip non-article nodes so their text does not leak into the body.
            for noise in container.select(noise_selector):
                noise.decompose()

            body_text = container.get_text(separator=" ", strip=True)
            if len(body_text.strip()) > 100:
                return body_text

        # Fallback: stitch together every acceptable paragraph on the page.
        paragraph_texts = (p.get_text().strip() for p in soup.select("p"))
        kept = [
            text
            for text in paragraph_texts
            if text and not self._should_skip_content(text)
        ]
        joined = " ".join(kept)
        if len(joined.strip()) > 100:
            return joined

        return None

    def _clean_content(self, content: str) -> str:
        """Clean extracted content."""
        if not content:
            return ""

        # Remove multiple whitespace
        content = re.sub(r"\s+", " ", content)

        # Remove content that matches skip patterns
        lines = content.split(". ")
        cleaned_lines = []

        for line in lines:
            if not self._should_skip_content(line):
                cleaned_lines.append(line)

        return ". ".join(cleaned_lines).strip()

    def _should_skip_content(self, text: str) -> bool:
        """Check if content should be skipped based on patterns."""
        text_lower = text.lower()

        # Skip if matches any skip pattern
        for pattern in self.SKIP_PATTERNS:
            if pattern in text_lower:
                return True

        # Skip if too short
        if len(text.strip()) < 10:
            return True

        return False