# Source code for the_data_packet.core.config

"""Unified configuration system for The Data Packet.

This module provides centralized configuration management with support for:
- Environment variable loading
- Type-safe configuration with validation
- Default values for all settings
- Global configuration singleton pattern
- Override capabilities for testing

The configuration system follows these priorities (highest to lowest):
1. Direct parameter overrides
2. Environment variables
3. Default values

Configuration Categories:
    API Keys:
        - Anthropic API key for Claude script generation
        - ElevenLabs API key for TTS audio generation
        - AWS credentials for S3 storage

    Podcast Settings:
        - Show metadata (name, episode numbers)
        - Audio preferences (voices, sample rate)
        - RSS feed configuration

    Processing Options:
        - Which generation steps to run
        - Article collection preferences
        - Output and cleanup settings

    Network Settings:
        - HTTP timeouts and user agents
        - Retry configurations
        - Rate limiting settings

Usage:
    # Get default configuration (loads from environment)
    config = get_config()

    # Override specific values
    config = get_config(
        show_name="My Custom Podcast",
        max_articles_per_source=3
    )

    # Access configuration values
    if config.anthropic_api_key:
        generator = ScriptGenerator(config.anthropic_api_key)

Environment Variables:
    Required for script generation:
        ANTHROPIC_API_KEY - Claude API key

    Required for audio generation:
        GCS_BUCKET_NAME - Google Cloud Storage bucket for long audio synthesis
        GOOGLE_APPLICATION_CREDENTIALS - Path to service account JSON (optional if using default credentials)

    Legacy (deprecated):
        ELEVENLABS_API_KEY - ElevenLabs API key (replaced by Google Cloud TTS)

    Optional for S3 uploads:
        S3_BUCKET_NAME - S3 bucket for hosting
        AWS_ACCESS_KEY_ID - AWS access key
        AWS_SECRET_ACCESS_KEY - AWS secret key
        AWS_REGION - AWS region (default: us-east-1)

    Optional for Grafana Loki log aggregation:
        GRAFANA_LOKI_URL - Loki endpoint URL
        GRAFANA_LOKI_USERNAME - Loki authentication username
        GRAFANA_LOKI_PASSWORD - Loki authentication password/API key

    Optional customizations:
        SHOW_NAME - Podcast name override
        LOG_LEVEL - Logging level (DEBUG/INFO/WARNING/ERROR)
        MAX_ARTICLES_PER_SOURCE - Max articles per source

    Logging configuration:
        LOG_DIRECTORY - Directory for JSONL log files (default: output/logs)
        ENABLE_JSONL_LOGGING - Enable JSONL file logging (true/false, default: true)
        ENABLE_S3_LOG_UPLOAD - Enable S3 upload of logs (true/false, default: true)
        LOG_UPLOAD_INTERVAL - Upload interval in seconds (default: 3600)
        REMOVE_LOGS_AFTER_UPLOAD - Remove local logs after S3 upload (true/false, default: false)
"""

import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional

from the_data_packet.core.exceptions import ConfigurationError


@dataclass
class Config:
    """Unified configuration for The Data Packet with environment variable support.

    This class provides type-safe configuration management with automatic
    environment variable loading and validation. All fields have sensible
    defaults and can be overridden via environment variables or direct
    parameter passing.

    Attributes:
        API Keys:
            anthropic_api_key: Anthropic API key for Claude script generation.
                Required for script generation. Loaded from ANTHROPIC_API_KEY.
            elevenlabs_api_key: [DEPRECATED] ElevenLabs API key for legacy TTS.
                Replaced by Google Cloud TTS. Loaded from ELEVENLABS_API_KEY.
            mongodb_username: MongoDB username for episode tracking and article
                deduplication. Optional. Loaded from MONGODB_USERNAME.
            mongodb_password: MongoDB password for episode tracking and article
                deduplication. Optional. Loaded from MONGODB_PASSWORD.

        Google Cloud Configuration:
            google_credentials_path: Path to Google Cloud service account JSON
                file. Optional if using default application credentials.
                Loaded from GOOGLE_APPLICATION_CREDENTIALS.
            gcs_bucket_name: Google Cloud Storage bucket for long audio
                synthesis output. Required for audio generation. Loaded from
                GCS_BUCKET_NAME.

        AWS Configuration:
            aws_access_key_id: AWS access key for S3 uploads. Loaded from
                AWS_ACCESS_KEY_ID.
            aws_secret_access_key: AWS secret key for S3 uploads. Loaded from
                AWS_SECRET_ACCESS_KEY.
            aws_region: AWS region for S3 operations. Default: us-east-1.
            s3_bucket_name: S3 bucket name for hosting files. Loaded from
                S3_BUCKET_NAME.

        Grafana Loki Configuration:
            grafana_loki_url: Loki endpoint URL for log aggregation.
                Loaded from GRAFANA_LOKI_URL.
            grafana_loki_username: Username for Loki authentication.
                Loaded from GRAFANA_LOKI_USERNAME.
            grafana_loki_password: Password/API key for Loki authentication.
                Loaded from GRAFANA_LOKI_PASSWORD.

        Podcast Configuration:
            show_name: Podcast show name. Used in RSS feeds and file names.
            episode_number: Episode number for RSS feeds. Auto-generated if None.
            output_directory: Local directory for generated files.

        Article Collection:
            max_articles_per_source: Maximum articles to collect per source.
            article_sources: List of news sources to use (wired, techcrunch).
            article_categories: List of categories to fetch from each source.
            source_category_mapping: Maps each source to its supported categories.

        AI Generation Settings:
            claude_model: Claude model name for script generation.
            tts_model: Text-to-speech service type (now "google_cloud_tts").
            max_tokens: Maximum tokens for Claude API calls.
            temperature: AI generation temperature (0.0-1.0, lower = more
                consistent).

        Audio Settings (Google Cloud Studio Multi-speaker voices):
            male_voice: First speaker voice name (Alex - male narrator).
            female_voice: Second speaker voice name (Sam - female narrator).
            audio_sample_rate: Audio sample rate in Hz.

        Processing Options:
            generate_script: Whether to generate podcast scripts.
            generate_audio: Whether to generate audio files.
            generate_rss: Whether to generate RSS feeds.
            save_intermediate_files: Whether to keep intermediate processing
                files.
            cleanup_temp_files: Whether to clean up temporary files after
                processing.

        RSS Feed Configuration:
            rss_channel_title: RSS channel title.
            rss_channel_description: RSS channel description.
            rss_channel_link: RSS channel website link.
            rss_channel_image_url: RSS channel artwork URL.
            rss_channel_email: Contact email for podcast.
            max_rss_episodes: Maximum episodes to keep in RSS feed.

        Network Settings:
            http_timeout: HTTP request timeout in seconds.
            user_agent: User agent string for HTTP requests.
            log_level: Logging level (DEBUG/INFO/WARNING/ERROR/CRITICAL).

    Example:
        # Default configuration with environment variables
        config = Config()

        # Custom configuration
        config = Config(
            show_name="Tech News Daily",
            max_articles_per_source=3,
            male_voice="en-US-Studio-Q",
            female_voice="en-US-Studio-O",
        )

        # Validate before use
        config.validate_for_script_generation()
        config.validate_for_audio_generation()
    """

    # API Keys
    anthropic_api_key: Optional[str] = None
    # Deprecated - use Google Cloud TTS
    elevenlabs_api_key: Optional[str] = None
    mongodb_username: Optional[str] = None
    mongodb_password: Optional[str] = None

    # Google Cloud Configuration
    # Path to service account JSON
    google_credentials_path: Optional[str] = None
    # GCS bucket for long audio synthesis
    gcs_bucket_name: Optional[str] = None

    # AWS Configuration
    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[str] = None
    aws_region: str = "us-east-1"
    s3_bucket_name: Optional[str] = None

    # Grafana Loki Configuration
    grafana_loki_url: Optional[str] = None
    grafana_loki_username: Optional[str] = None
    grafana_loki_password: Optional[str] = None

    # Podcast Configuration
    show_name: str = "The Data Packet"
    episode_number: Optional[int] = None
    output_directory: Path = Path("./output")

    # Article Collection
    max_articles_per_source: int = 1
    article_sources: List[str] = field(
        default_factory=lambda: ["wired", "techcrunch"]
    )
    article_categories: List[str] = field(
        default_factory=lambda: ["security", "ai"]
    )
    source_category_mapping: Dict[str, List[str]] = field(
        default_factory=lambda: {
            "wired": ["security", "science", "ai"],
            "techcrunch": ["ai", "security"],
        }
    )

    # AI Generation Settings
    claude_model: str = "claude-sonnet-4-5-20250929"
    tts_model: str = "google_cloud_tts"  # Updated to use Google Cloud TTS
    max_tokens: int = 3000
    temperature: float = 0.7

    # Audio Settings (Google Cloud Studio Multi-speaker voices)
    male_voice: str = "en-US-Studio-Q"  # Alex (male narrator)
    female_voice: str = "en-US-Studio-O"  # Sam (female narrator)
    audio_sample_rate: int = 44100

    # Processing Options
    generate_script: bool = True
    generate_audio: bool = True
    generate_rss: bool = True
    save_intermediate_files: bool = False
    cleanup_temp_files: bool = True

    # RSS Feed Configuration
    rss_channel_title: Optional[str] = "The Data Packet"
    rss_channel_description: Optional[str] = None
    rss_channel_link: Optional[str] = None
    rss_channel_image_url: Optional[str] = (
        "https://the-data-packet.s3.us-west-2.amazonaws.com/the-data-packet/the_data_packet.png"
    )
    # Contact email for podcast
    rss_channel_email: Optional[str] = "contact@thewintershadow.com"
    max_rss_episodes: int = 500

    # Network Settings
    http_timeout: int = 30
    user_agent: str = (
        "The Data Packet/1.0 (+https://github.com/TheWinterShadow/The-Data-Packet)"
    )

    # Logging
    log_level: str = "INFO"
    log_dir: str = "output/logs"
    enable_jsonl_logging: bool = True
    enable_s3_log_upload: bool = True
    log_upload_interval: int = 3600  # seconds
    remove_logs_after_upload: bool = False

    def __post_init__(self) -> None:
        """Load configuration from environment variables, then validate."""
        self._load_from_env()
        self._validate()

    def _load_from_env(self) -> None:
        """Load configuration from environment variables.

        Explicitly passed values take priority: for credential-style fields
        the env var is only consulted when the current value is falsy; for
        the remaining fields the env var (when set) overrides the default.
        """
        # API Keys
        self.anthropic_api_key = self.anthropic_api_key or os.getenv(
            "ANTHROPIC_API_KEY"
        )
        self.elevenlabs_api_key = self.elevenlabs_api_key or os.getenv(
            "ELEVENLABS_API_KEY"
        )

        # Google Cloud
        self.google_credentials_path = self.google_credentials_path or os.getenv(
            "GOOGLE_APPLICATION_CREDENTIALS"
        )
        self.gcs_bucket_name = self.gcs_bucket_name or os.getenv("GCS_BUCKET_NAME")
        self.mongodb_username = self.mongodb_username or os.getenv("MONGODB_USERNAME")
        self.mongodb_password = self.mongodb_password or os.getenv("MONGODB_PASSWORD")

        # AWS
        self.aws_access_key_id = self.aws_access_key_id or os.getenv(
            "AWS_ACCESS_KEY_ID"
        )
        self.aws_secret_access_key = self.aws_secret_access_key or os.getenv(
            "AWS_SECRET_ACCESS_KEY"
        )
        self.aws_region = os.getenv("AWS_REGION", self.aws_region)
        self.s3_bucket_name = self.s3_bucket_name or os.getenv("S3_BUCKET_NAME")

        # Grafana Loki
        self.grafana_loki_url = self.grafana_loki_url or os.getenv("GRAFANA_LOKI_URL")
        self.grafana_loki_username = self.grafana_loki_username or os.getenv(
            "GRAFANA_LOKI_USERNAME"
        )
        self.grafana_loki_password = self.grafana_loki_password or os.getenv(
            "GRAFANA_LOKI_PASSWORD"
        )

        # Other settings
        if env_show_name := os.getenv("SHOW_NAME"):
            self.show_name = env_show_name
        if env_log_level := os.getenv("LOG_LEVEL"):
            self.log_level = env_log_level
        if env_log_dir := os.getenv("LOG_DIRECTORY"):
            self.log_dir = env_log_dir
        if env_enable_jsonl := os.getenv("ENABLE_JSONL_LOGGING"):
            self.enable_jsonl_logging = env_enable_jsonl.lower() in (
                "true",
                "1",
                "yes",
                "on",
            )
        if env_enable_s3_upload := os.getenv("ENABLE_S3_LOG_UPLOAD"):
            self.enable_s3_log_upload = env_enable_s3_upload.lower() in (
                "true",
                "1",
                "yes",
                "on",
            )
        if env_upload_interval := os.getenv("LOG_UPLOAD_INTERVAL"):
            try:
                self.log_upload_interval = int(env_upload_interval)
            except ValueError:
                # Best-effort: keep the default on an unparsable value.
                pass
        if env_remove_logs := os.getenv("REMOVE_LOGS_AFTER_UPLOAD"):
            self.remove_logs_after_upload = env_remove_logs.lower() in (
                "true",
                "1",
                "yes",
                "on",
            )
        if env_output_dir := os.getenv("OUTPUT_DIRECTORY"):
            self.output_directory = Path(env_output_dir)

        # RSS configuration
        if env_rss_title := os.getenv("RSS_CHANNEL_TITLE"):
            self.rss_channel_title = env_rss_title
        if env_rss_desc := os.getenv("RSS_CHANNEL_DESCRIPTION"):
            self.rss_channel_description = env_rss_desc
        if env_rss_link := os.getenv("RSS_CHANNEL_LINK"):
            self.rss_channel_link = env_rss_link
        if env_rss_image := os.getenv("RSS_CHANNEL_IMAGE_URL"):
            self.rss_channel_image_url = env_rss_image
        if env_rss_email := os.getenv("RSS_CHANNEL_EMAIL"):
            self.rss_channel_email = env_rss_email

        # Convert string env vars to proper types
        if env_max_articles := os.getenv("MAX_ARTICLES_PER_SOURCE"):
            try:
                self.max_articles_per_source = int(env_max_articles)
            except ValueError:
                pass
        if env_max_episodes := os.getenv("MAX_RSS_EPISODES"):
            try:
                self.max_rss_episodes = int(env_max_episodes)
            except ValueError:
                pass
        if env_generate_rss := os.getenv("GENERATE_RSS"):
            self.generate_rss = env_generate_rss.lower() in ("true", "1", "yes")
        if env_timeout := os.getenv("HTTP_TIMEOUT"):
            try:
                self.http_timeout = int(env_timeout)
            except ValueError:
                pass

    def _validate(self) -> None:
        """Validate configuration.

        Collects all problems before raising so the error message reports
        every issue at once.

        Raises:
            ConfigurationError: If any validation check fails.
        """
        errors = []

        # Ensure output directory exists (created eagerly so later steps
        # can assume it is writable).
        if not self.output_directory.exists():
            try:
                self.output_directory.mkdir(parents=True, exist_ok=True)
            except Exception as e:
                errors.append(
                    f"Cannot create output directory {self.output_directory}: {e}"
                )

        # Validate log level
        valid_log_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
        if self.log_level.upper() not in valid_log_levels:
            errors.append(f"Invalid log level: {self.log_level}")

        # Validate source-category compatibility: every configured category
        # must be supported by every configured source.
        for source in self.article_sources:
            if source not in self.source_category_mapping:
                errors.append(f"Unknown source: {source}")
            else:
                for category in self.article_categories:
                    if category not in self.source_category_mapping[source]:
                        errors.append(
                            f"Category '{category}' not supported by source '{source}'. "
                            f"Supported categories for {source}: {self.source_category_mapping[source]}"
                        )

        if errors:
            raise ConfigurationError(
                f"Configuration validation failed: {'; '.join(errors)}"
            )

    def validate_for_script_generation(self) -> None:
        """Validate configuration for script generation.

        Raises:
            ConfigurationError: If the Anthropic API key is missing.
        """
        if not self.anthropic_api_key:
            raise ConfigurationError(
                "Anthropic API key is required for script generation"
            )

    def validate_for_audio_generation(self) -> None:
        """Validate configuration for audio generation.

        Raises:
            ConfigurationError: If the GCS bucket name is missing.
        """
        if not self.gcs_bucket_name:
            raise ConfigurationError(
                "Google Cloud Storage bucket is required for audio generation. "
                "Set gcs_bucket_name in config or GCS_BUCKET_NAME environment variable."
            )

    def get_sources_for_category(self, category: str) -> List[str]:
        """Get list of sources that support a given category.

        Only sources that are both in ``article_sources`` and whose mapping
        includes the category are returned.

        Args:
            category: Category name to check

        Returns:
            List of source names that support the category
        """
        return [
            source
            for source, categories in self.source_category_mapping.items()
            if category in categories and source in self.article_sources
        ]

    def get_categories_for_source(self, source: str) -> List[str]:
        """Get list of categories supported by a given source.

        The result is restricted to categories that are also present in
        ``article_categories``; unknown sources yield an empty list.

        Args:
            source: Source name to check

        Returns:
            List of category names supported by the source
        """
        if source not in self.source_category_mapping:
            return []
        return [
            category
            for category in self.source_category_mapping[source]
            if category in self.article_categories
        ]

    def to_dict(self) -> Dict:
        """Convert configuration to dictionary.

        Path values are stringified so the result is JSON-serializable.
        """
        result = {}
        for field_name, field_value in self.__dict__.items():
            if isinstance(field_value, Path):
                result[field_name] = str(field_value)
            else:
                result[field_name] = field_value
        return result
# Global configuration instance _config: Optional[Config] = None
def get_config(**overrides: Any) -> Config:
    """Return the global Config singleton, creating it on first use.

    Passing any keyword overrides always builds a fresh Config (replacing
    the cached singleton), which supports test-time customization.

    Args:
        **overrides: Configuration values to override

    Returns:
        Config instance
    """
    global _config
    needs_rebuild = _config is None or bool(overrides)
    if needs_rebuild:
        _config = Config(**overrides)
    return _config
def reset_config() -> None:
    """Discard the cached global configuration.

    The next call to get_config() will construct a fresh Config from the
    current environment.
    """
    global _config
    _config = None