Spaces:

fireworks-ai
/

Fed-AI-Savant

Running

App Files Files Community

RobertoBarrosoLuque committed on Aug 7

Commit

8cc0920

1 Parent(s): 9e054cc

Add scraper

Browse files

Files changed (5) hide show

configs/prompt_library.yaml +28 -0
src/modules/__init__.py +0 -0
src/modules/constants.py +7 -0
src/modules/data_pipeline.py +503 -0
src/modules/llm_completions.py +62 -0

configs/prompt_library.yaml ADDED Viewed

	@@ -0,0 +1,28 @@

+extract_rate_decision: |
+  You are an expert economist analyzing Federal Reserve meetings. Your task is to extract the key interest rate decision and provide clear, actionable insights from FOMC meeting minutes.
+  CRITICAL INSTRUCTIONS:
+  1. Look for the specific target range decision in the "Committee Policy Actions" section
+  2. The federal funds rate is expressed as a range (e.g., "4¼ to 4½ percent")
+  3. Extract forward guidance about future policy direction
+  4. Identify key economic factors driving the decision
+  5. Use plain language that business leaders and investors can understand
+  REQUIRED OUTPUT FORMAT:
+  - Action: [Raised/Lowered/Maintained] federal funds rate
+  - Rate: [Current target range, e.g., "4.25%-4.50%"]
+  - Magnitude: [Amount of change, e.g., "0.25 percentage points" or "No change"]
+  - Forward Guidance: [What the Fed signaled about future rate changes in 1-2 sentences]
+  - Key Economic Factors: [List 3-4 main factors that influenced the decision]
+  - Economic Outlook: [Fed's assessment of growth, employment, and inflation in 2-3 sentences]
+  - Market Impact: [Likely implications for businesses, consumers, and markets in 1-2 sentences]
+  SPECIFIC SECTIONS TO ANALYZE:
+  - "Committee Policy Actions" (for the actual rate decision)
+  - "Participants' Views on Current Conditions" (for economic assessment)
+  - Post-meeting statement (for forward guidance)
+  - Staff economic projections (for outlook)
+  Meeting Date: {meeting_date}
+  Title: {meeting_title}
+  Meeting Text: {text}

src/modules/__init__.py ADDED Viewed

File without changes

src/modules/constants.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import yaml
+from pathlib import Path
+# Absolute path to configs/prompt_library.yaml, two directory levels above this module's folder.
+_PATH_TO_CONFIGS = Path(__file__).parents[2] / "configs" / "prompt_library.yaml"
+# The prompt library contains non-ASCII characters (e.g. "¼", "½"), so decode it
+# explicitly as UTF-8 instead of relying on the platform's default encoding.
+with open(_PATH_TO_CONFIGS, "r", encoding="utf-8") as f:
+    # Parsed YAML content: prompt name -> prompt template text.
+    PROMPT_LIBRARY = yaml.safe_load(f)

src/modules/data_pipeline.py ADDED Viewed

	@@ -0,0 +1,503 @@

+import asyncio
+import json
+import os
+import re
+import ssl
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+from urllib.parse import urljoin
+import aiohttp
+import certifi
+import requests
+from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+from pydantic import BaseModel
+import pdfplumber
+import tempfile
+from src.modules.llm_completions import get_llm, run_multi_llm_completions
+from src.modules.constants import PROMPT_LIBRARY
+# Default output directory for the pipeline's JSON files: the "data" folder
+# located two directory levels above this module's folder.
+DATA_DIR = Path(__file__).parents[2] / "data"
+class RateDecision(BaseModel):
+    """Structured result of the `extract_rate_decision` prompt for one FOMC meeting.
+
+    The fields mirror the "REQUIRED OUTPUT FORMAT" section of that prompt.
+    """
+    # Direction of the decision: raised / lowered / maintained the federal funds rate.
+    action: str
+    # Current target range, e.g. "4.25%-4.50%".
+    rate: str
+    # Size of the change, e.g. "0.25 percentage points" or "No change".
+    magnitude: str
+    # What the Fed signaled about future rate changes.
+    forward_guidance: str
+    # Main factors that influenced the decision.
+    key_economic_factors: List[str]
+    # The Fed's assessment of growth, employment and inflation.
+    economic_outlook: str
+    # Likely implications for businesses, consumers and markets.
+    market_impact: str
+class Meeting:
+    """One FOMC meeting: its scraped text plus optional analysis results."""
+    def __init__(self, date: str, title: str, full_text: str, url: str = ""):
+        self.date, self.title = date, title
+        self.full_text, self.url = full_text, url
+        # Analysis fields start empty and are assigned after construction.
+        self.rate_decision = None
+        self.summary = None
+    def to_dict(self) -> Dict:
+        """Return the meeting's attributes as a plain dictionary."""
+        field_names = ("date", "title", "full_text", "url", "rate_decision", "summary")
+        return {name: getattr(self, name) for name in field_names}
+    @classmethod
+    def from_dict(cls, data: Dict) -> 'Meeting':
+        """Rebuild a meeting from a dictionary shaped like the output of ``to_dict``.
+
+        "date", "title" and "full_text" are required keys; "url" falls back to ""
+        and the analysis fields fall back to None.
+        """
+        restored = cls(
+            date=data["date"],
+            title=data["title"],
+            full_text=data["full_text"],
+            url=data.get("url", ""),
+        )
+        restored.rate_decision = data.get("rate_decision")
+        restored.summary = data.get("summary")
+        return restored
+class FedScraper:
+    """Scrapes FOMC meeting minutes from federalreserve.gov.
+
+    The calendar page is fetched synchronously with ``requests``; individual
+    documents are fetched through an ``aiohttp`` session. When no session is
+    passed to the constructor, one is created on ``__aenter__`` and closed on
+    ``__aexit__``; a caller-supplied session is left open.
+    """
+    BASE_URL = "https://www.federalreserve.gov"
+    CALENDAR_URL = "https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm"
+    # Browser-like User-Agent sent with the calendar request and with document requests.
+    _USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+    def __init__(self, session: Optional[aiohttp.ClientSession] = None):
+        self.session = session
+        # Only a session created by this object is closed by it.
+        self._own_session = session is None
+    async def __aenter__(self):
+        if self._own_session:
+            # Verify TLS against certifi's CA bundle.
+            ssl_context = ssl.create_default_context(cafile=certifi.where())
+            self.session = aiohttp.ClientSession(
+                connector=aiohttp.TCPConnector(ssl=ssl_context),
+                headers={'User-Agent': self._USER_AGENT},
+                timeout=aiohttp.ClientTimeout(total=30)
+            )
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        if self._own_session and self.session:
+            await self.session.close()
+            # Drop the closed session so a stale handle is never reused.
+            self.session = None
+    def get_calendar_page(self) -> BeautifulSoup:
+        """Download and parse the FOMC calendar page.
+
+        Returns:
+            The parsed calendar page.
+        Raises:
+            requests.exceptions.RequestException: on network, TLS or HTTP-status
+                errors. A failed certificate check is not retried insecurely.
+        """
+        # The context manager closes the session (and its connections) on exit.
+        with requests.Session() as session:
+            session.headers.update({'User-Agent': self._USER_AGENT})
+            # Always verify the server certificate, using the same CA bundle as
+            # the aiohttp session. Retrying with verify=False would accept content
+            # from any host able to intercept the connection.
+            response = session.get(self.CALENDAR_URL, timeout=30, verify=certifi.where())
+            response.raise_for_status()
+            return BeautifulSoup(response.content, 'html.parser')
+    async def scrape_meetings(self, max_meetings: int = 20, year_range: Tuple[int, int] = (2022, 2024)) -> List[Meeting]:
+        """Scrape the PDF minutes of the most recent meetings in ``year_range``.
+
+        Args:
+            max_meetings: upper bound on the number of documents downloaded.
+            year_range: inclusive (first_year, last_year) filter on meeting dates.
+        Returns:
+            One ``Meeting`` per document from which text could be extracted,
+            newest first. Documents that fail are reported and skipped.
+        """
+        print("Fetching FOMC calendar page...")
+        soup = self.get_calendar_page()
+        print(f"Extracting meeting links for years {year_range[0]}-{year_range[1]}...")
+        meeting_links = self.extract_meeting_links(soup, year_range)
+        # Only the PDF version of each set of minutes is downloaded.
+        pdf_links = [
+            (date, f"FOMC Meeting {date}", link)
+            for date, _, link in meeting_links if link.lower().endswith('.pdf')
+        ]
+        if not pdf_links:
+            print("No meeting links found")
+            return []
+        print(f"Found {len(pdf_links)} meetings")
+        # Apply the limit to the list that is actually iterated below.
+        if len(pdf_links) > max_meetings:
+            print(f"Processing first {max_meetings} meetings")
+            pdf_links = pdf_links[:max(max_meetings, 0)]
+        total = len(pdf_links)
+        meetings = []
+        async with self:  # This will call __aenter__ and __aexit__
+            for i, (date, title, url) in enumerate(pdf_links, 1):
+                try:
+                    print(f"\n[{i}/{total}] Scraping: {date}")
+                    print(f"  URL: {url}")
+                    content = await self.scrape_meeting_content(url)
+                    if content:
+                        meetings.append(Meeting(date, title, content, url))
+                        print(f"  Successfully extracted {len(content)} characters")
+                    else:
+                        print(f"  No content extracted from {url}")
+                    # Rate limiting - be respectful to Fed servers
+                    if i < total:
+                        print("  Waiting 2 seconds before next request...")
+                        await asyncio.sleep(2)
+                except Exception as e:
+                    # Best effort: one bad document must not abort the whole run.
+                    print(f"  Error scraping meeting {date}: {e}")
+        print(f"\nSuccessfully scraped {len(meetings)} out of {total} meetings")
+        return meetings
+    async def scrape_meeting_content(self, url: str) -> str:
+        """Fetch ``url`` and return its text, handling both PDF and HTML responses.
+
+        Returns:
+            The extracted text, or "" when the request or the extraction fails.
+        Raises:
+            RuntimeError: if no session is available (context manager not entered).
+        """
+        if not self.session:
+            raise RuntimeError("Session not initialized. Use async context manager.")
+        try:
+            async with self.session.get(url) as response:
+                response.raise_for_status()
+                # Decide by Content-Type header, falling back to the URL suffix.
+                content_type = response.headers.get('content-type', '').lower()
+                if 'application/pdf' in content_type or url.lower().endswith('.pdf'):
+                    print(f"  Processing PDF: {url}")
+                    return await self._extract_pdf_text(response)
+                else:
+                    print(f"  Processing HTML: {url}")
+                    return await self._extract_html_text(response)
+        except Exception as e:
+            print(f"  Error processing {url}: {e}")
+            return ""
+    async def _extract_pdf_text(self, response) -> str:
+        """Extract text from a PDF response using pdfplumber.
+
+        Returns:
+            The cleaned text of all pages joined by blank lines, or "" on failure.
+        """
+        import io  # local import: only needed to wrap the downloaded bytes
+        try:
+            pdf_content = await response.read()
+            text_content = []
+            # pdfplumber accepts a file-like object, so the PDF is parsed from
+            # memory; no temporary file has to be created, reopened or deleted.
+            with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
+                print(f"    Extracting text from {len(pdf.pages)} pages")
+                for page_num, page in enumerate(pdf.pages):
+                    try:
+                        page_text = page.extract_text()
+                        if page_text and page_text.strip():
+                            # Clean up common PDF artifacts
+                            text_content.append(self._clean_pdf_text(page_text))
+                    except Exception as e:
+                        # Skip unreadable pages but keep the rest of the document.
+                        print(f"    Could not extract text from page {page_num + 1}: {e}")
+            # Join all page text
+            return '\n\n'.join(text_content)
+        except Exception as e:
+            print(f"    Error extracting PDF text: {e}")
+            return ""
+    @staticmethod
+    def _clean_pdf_text(text: str) -> str:
+        """Clean common PDF text extraction artifacts"""
+        # Collapse runs of spaces/tabs; newlines are left alone here.
+        text = re.sub(r'[ \t]+', ' ', text)
+        # Fix common PDF line break issues
+        text = re.sub(r'(\w)-\s*\n\s*(\w)', r'\1\2', text)  # Rejoin hyphenated words
+        text = re.sub(r'(?<=[.!?])\s*\n\s*(?=[A-Z])', ' ', text)  # Join sentences split across lines
+        # Remove page numbers and headers/footers (common patterns)
+        text = re.sub(r'\n\s*\d+\s*\n', '\n', text)  # Standalone page numbers
+        text = re.sub(r'\n\s*Page \d+ of \d+\s*\n', '\n', text)  # "Page X of Y"
+        return text.strip()
+    @staticmethod
+    async def _extract_html_text(response) -> str:
+        """Extract the visible text of an HTML response.
+
+        Returns:
+            Whitespace-normalised text of the main content area, or "" when
+            nothing could be extracted.
+        """
+        try:
+            try:
+                content = await response.text()
+            except UnicodeDecodeError:
+                # Fallback for encoding issues
+                content_bytes = await response.read()
+                content = content_bytes.decode('utf-8', errors='ignore')
+            soup = BeautifulSoup(content, 'html.parser')
+            # Remove script and style elements
+            for script in soup(["script", "style"]):
+                script.decompose()
+            # Try progressively more generic containers for the main content.
+            content_div = (
+                    soup.find('div', {'class': 'col-xs-12 col-sm-8 col-md-8'}) or
+                    soup.find('div', {'id': 'article'}) or
+                    soup.find('div', {'class': 'content'}) or
+                    soup.find('main') or
+                    soup.body
+            )
+            if content_div:
+                text = content_div.get_text(separator=' ', strip=True)
+                text = re.sub(r'\s+', ' ', text)
+                print(f"    Extracted {len(text)} characters from HTML")
+                return text.strip()
+            print("    No content found in HTML")
+            return ""
+        except Exception as e:
+            print(f"    Error extracting HTML text: {e}")
+            return ""
+    def extract_meeting_links(self, soup: BeautifulSoup, year_range: Tuple[int, int] = (2022, 2024)) -> List[Tuple[str, str, str]]:
+        """Collect links to meeting minutes (HTML or PDF) from the calendar page.
+
+        Args:
+            soup: parsed calendar page.
+            year_range: inclusive (first_year, last_year) filter on the date in the URL.
+        Returns:
+            ``(date "YYYY-MM-DD", title, absolute URL)`` tuples, newest first.
+        """
+        meetings = []
+        for link in soup.find_all('a', href=True):
+            href = link.get('href', '')
+            # Compare case-insensitively throughout.
+            href_lower = href.lower()
+            if 'fomcminutes' not in href_lower and 'fomc/minutes' not in href_lower:
+                continue
+            # The meeting date is encoded in the URL as YYYYMMDD.
+            date_match = re.search(r'(\d{4})(\d{2})(\d{2})', href)
+            if not date_match:
+                continue
+            year, month, day = date_match.groups()
+            if not year_range[0] <= int(year) <= year_range[1]:
+                continue
+            date_str = f"{year}-{month}-{day}"
+            text = link.get_text().strip()
+            # Identify content type in title
+            content_type = "PDF" if href_lower.endswith('.pdf') else "HTML"
+            title_with_type = f"{text or 'FOMC Meeting ' + date_str} ({content_type})"
+            meetings.append((date_str, title_with_type, urljoin(self.BASE_URL, href)))
+        meetings.sort(key=lambda x: x[0], reverse=True)
+        return meetings
+class DataProcessor:
+    """Runs the rate-decision extraction prompt over scraped meetings."""
+    def __init__(self, api_key: str, model: str = "small"):
+        self.api_key, self.model = api_key, model
+        self.llm = get_llm(model, api_key)
+    async def process_meetings(self, meetings: List[Meeting]) -> List[str]:
+        """Build one prompt per meeting and return the LLM results in the same order."""
+        print(f"Processing {len(meetings)} meetings with LLM analysis...")
+        # Meeting text beyond this many characters is cut off before prompting.
+        max_chars = 100000
+        prompts = []
+        for meeting in meetings:
+            filled_prompt = PROMPT_LIBRARY['extract_rate_decision'].format(
+                meeting_date=meeting.date,
+                meeting_title=meeting.title,
+                text=meeting.full_text[:max_chars]
+            )
+            prompts.append(filled_prompt)
+        return await run_multi_llm_completions(
+            llm=self.llm,
+            prompts=prompts,
+            output_class=RateDecision
+        )
+class FedDataPipeline:
+    """Main pipeline for scraping and processing Fed meeting data.
+
+    Scraped and processed meetings are written as JSON files into ``data_dir``.
+    """
+    def __init__(self, api_key: str, model: str = "small", data_dir: Optional[str] = None):
+        """
+        Args:
+            api_key: API key handed to the ``DataProcessor``.
+            model: model alias handed to the ``DataProcessor``.
+            data_dir: output directory (str or Path); defaults to ``DATA_DIR``.
+                It is created, including missing parents, if it does not exist.
+        """
+        self.api_key = api_key
+        self.model = model
+        self.data_dir = Path(data_dir) if data_dir is not None else DATA_DIR
+        self.data_dir.mkdir(parents=True, exist_ok=True)
+        self.scraper = FedScraper()
+        self.processor = DataProcessor(api_key, model)
+    @staticmethod
+    def _attach_rate_decisions(meetings: List[Meeting], processed_results: List) -> None:
+        """Store each LLM result on its meeting, as a plain dict when the result offers one."""
+        if len(processed_results) != len(meetings):
+            # Results are matched to meetings by position, so a different count
+            # cannot be paired up safely; report it instead of skipping silently.
+            print(
+                f"Warning: received {len(processed_results)} results for "
+                f"{len(meetings)} meetings; rate decisions were not attached"
+            )
+            return
+        for meeting, result in zip(meetings, processed_results):
+            if hasattr(result, 'model_dump'):
+                # Pydantic v2 spelling; ``dict()`` is deprecated there.
+                meeting.rate_decision = result.model_dump()
+            elif hasattr(result, 'dict'):
+                meeting.rate_decision = result.dict()
+            else:
+                meeting.rate_decision = result
+    def save_meetings(self, meetings: List[Meeting], filename: Optional[str] = None) -> str:
+        """Save meetings to a JSON file inside ``data_dir``.
+
+        Args:
+            meetings: meetings to serialise with ``Meeting.to_dict``.
+            filename: target file name; a timestamped name is generated when omitted.
+        Returns:
+            The path of the written file, as a string.
+        """
+        if filename is None:
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            filename = f"fed_meetings_{timestamp}.json"
+        filepath = self.data_dir / filename
+        meetings_data = [meeting.to_dict() for meeting in meetings]
+        with open(filepath, 'w', encoding='utf-8') as f:
+            json.dump(meetings_data, f, indent=2, ensure_ascii=False)
+        print(f"Saved {len(meetings)} meetings to {filepath}")
+        return str(filepath)
+    def load_meetings(self, filename: str) -> List[Meeting]:
+        """Load meetings from a JSON file.
+
+        A relative ``filename`` is resolved against ``data_dir``; an absolute
+        path is used as given.
+        """
+        filepath = Path(filename) if os.path.isabs(filename) else self.data_dir / filename
+        with open(filepath, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        meetings = [Meeting.from_dict(item) for item in data]
+        print(f"Loaded {len(meetings)} meetings from {filepath}")
+        return meetings
+    async def process_from_scraped_data(self, scraped_filename: str) -> str:
+        """Run the LLM analysis on a previously saved scrape.
+
+        Returns:
+            Path of the processed JSON file, or "" when the file holds no meetings.
+        """
+        print(f"Loading scraped data from: {scraped_filename}")
+        meetings = self.load_meetings(scraped_filename)
+        if not meetings:
+            print("No meetings found in scraped data")
+            return ""
+        print(f"\nProcessing {len(meetings)} meetings with LLM analysis...")
+        processed_results = await self.processor.process_meetings(meetings)
+        # Update meetings with processed results
+        self._attach_rate_decisions(meetings, processed_results)
+        # Save final processed data
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        processed_filename = f"fed_meetings_processed_{timestamp}.json"
+        output_file = self.save_meetings(meetings, processed_filename)
+        print("\nProcessing completed successfully!")
+        print(f"Processed data: {output_file}")
+        return output_file
+    async def run_pipeline(self, max_meetings: int = 20, year_range: Tuple[int, int] = (2022, 2024)) -> str:
+        """Run the complete pipeline: scrape, save the raw scrape, analyse, save the result.
+
+        Returns:
+            Path of the processed JSON file, or "" when nothing was scraped.
+        """
+        print("Starting Fed AI Savant Data Pipeline...")
+        # Step 1: Scrape meeting data
+        print("\n1. Scraping FOMC meeting minutes...")
+        meetings = await self.scraper.scrape_meetings(max_meetings, year_range)
+        print(f"Scraped {len(meetings)} meetings")
+        if not meetings:
+            print("No meetings found to process")
+            return ""
+        # Save intermediate scraped data (before LLM processing) so a failed
+        # analysis step can be retried without scraping again.
+        print("\n1.5. Saving intermediate scraped data...")
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        scraped_filename = f"fed_meetings_scraped_{timestamp}.json"
+        scraped_filepath = self.save_meetings(meetings, scraped_filename)
+        print(f"Intermediate scraped data saved to: {scraped_filepath}")
+        # Step 2: Process with LLM analysis
+        print("\n2. Processing meetings with LLM analysis...")
+        processed_results = await self.processor.process_meetings(meetings)
+        # Update meetings with processed results
+        self._attach_rate_decisions(meetings, processed_results)
+        # Step 3: Save final processed data
+        print("\n3. Saving final processed data...")
+        processed_filename = f"fed_meetings_processed_{timestamp}.json"
+        output_file = self.save_meetings(meetings, processed_filename)
+        print("\nPipeline completed successfully!")
+        print(f"Scraped data: {scraped_filepath}")
+        print(f"Processed data: {output_file}")
+        return output_file
+async def main():
+    """Command-line entry point: parse arguments, then scrape and/or process meetings.
+
+    Reads FIREWORKS_API_KEY from the environment (after loading a .env file)
+    and returns without running anything when it is missing.
+    """
+    import argparse
+    # Load environment variables
+    load_dotenv()
+    parser = argparse.ArgumentParser(description="Fed AI Savant Data Pipeline")
+    parser.add_argument("--max-meetings", type=int, default=3, help="Maximum number of meetings to scrape")
+    parser.add_argument("--start-year", type=int, default=2022, help="Start year for meeting range")
+    parser.add_argument("--end-year", type=int, default=2025, help="End year for meeting range")
+    parser.add_argument("--data-dir", default="data", help="Directory to save data files")
+    parser.add_argument("--from-scraped", type=str, help="Process from already scraped data file (skips scraping)")
+    args = parser.parse_args()
+    # An inverted range can never match a meeting; reject it up front.
+    if args.start_year > args.end_year:
+        parser.error("--start-year must not be greater than --end-year")
+    # Get API key from environment
+    api_key = os.getenv("FIREWORKS_API_KEY")
+    if not api_key:
+        print("Error: FIREWORKS_API_KEY not found in environment variables")
+        print("Please create a .env file with: FIREWORKS_API_KEY=your_api_key_here")
+        return
+    # Create and run pipeline (using default "small" model)
+    pipeline = FedDataPipeline(
+        api_key=api_key,
+        model="small"
+    )
+    # Apply --data-dir by setting the pipeline's output directory on the
+    # instance, so the option is honoured without relying on a constructor keyword.
+    pipeline.data_dir = Path(args.data_dir)
+    pipeline.data_dir.mkdir(parents=True, exist_ok=True)
+    # Check if processing from already scraped data
+    if args.from_scraped:
+        print(f"Processing from scraped data: {args.from_scraped}")
+        output_file = await pipeline.process_from_scraped_data(args.from_scraped)
+    else:
+        year_range = (args.start_year, args.end_year)
+        output_file = await pipeline.run_pipeline(args.max_meetings, year_range)
+    if output_file:
+        print(f"\nSuccessfully completed! Data saved to: {output_file}")
+    else:
+        print("\nPipeline failed or no data processed")
+# Run the asynchronous command-line entry point when executed as a script.
+if __name__ == "__main__":
+    asyncio.run(main())

src/modules/llm_completions.py ADDED Viewed

	@@ -0,0 +1,62 @@

+from fireworks import LLM
+from pydantic import BaseModel
+import asyncio
+# Short model aliases accepted by get_llm(), mapped to Fireworks model identifiers.
+MODELS = {
+    "small": "accounts/fireworks/models/gpt-oss-20b",
+    "large": "accounts/fireworks/models/gpt-oss-120b"
+}
+# Module-level semaphore with 100 permits, referenced by run_multi_llm_completions().
+semaphore = asyncio.Semaphore(100)
+def get_llm(model: str, api_key: str) -> LLM:
+    """Create a serverless LLM client for a model alias defined in MODELS.
+
+    Raises:
+        KeyError: if ``model`` is not a key of MODELS.
+    """
+    model_id = MODELS[model]
+    return LLM(model=model_id, api_key=api_key, deployment_type="serverless")
+async def get_llm_completion(llm: LLM, prompt_text: str, output_class: BaseModel = None) -> str:
+    """Send one user prompt to the LLM and return the completion result.
+
+    Args:
+        llm: client whose ``chat.completions.create`` is called.
+        prompt_text: content of the single user message.
+        output_class: optional pydantic model (class or instance) forwarded to
+            the client as ``output_class``; omitted from the call when None.
+    Returns:
+        Whatever ``llm.chat.completions.create`` returns.
+    """
+    request_kwargs = {
+        "messages": [
+            {
+                "role": "user",
+                "content": prompt_text
+            },
+        ],
+        "temperature": 0.1,
+    }
+    # Callers pass the model *class*. isinstance(SomeModel, BaseModel) is False
+    # for a class, so subclasses are detected with issubclass; instances are
+    # still accepted as before.
+    is_model_class = isinstance(output_class, type) and issubclass(output_class, BaseModel)
+    if is_model_class or isinstance(output_class, BaseModel):
+        # NOTE(review): confirm that the installed Fireworks SDK accepts the
+        # ``output_class`` keyword on chat.completions.create.
+        request_kwargs["output_class"] = output_class
+    # create() is a blocking call; running it in a worker thread keeps the event
+    # loop free so several completions can be in flight at the same time.
+    # NOTE(review): assumes the client may be used from worker threads - confirm.
+    return await asyncio.to_thread(llm.chat.completions.create, **request_kwargs)
+async def run_multi_llm_completions(llm: LLM, prompts: list[str], output_class: BaseModel) -> list[str]:
+    """
+    Run one LLM completion per prompt, with at most as many requests in flight
+    as the module-level semaphore has permits.
+    :param llm: client passed to get_llm_completion
+    :param prompts: prompt texts, one completion is requested for each
+    :param output_class: optional pydantic model forwarded to get_llm_completion
+    :return: completion results in the same order as ``prompts``
+    """
+    async def _bounded_completion(prompt: str):
+        # asyncio.Semaphore only supports "async with" (a plain "with" raises),
+        # and the permit must be held while the request runs, not merely while
+        # the task is being created.
+        async with semaphore:
+            # get_llm_completion decides itself whether output_class is usable.
+            return await get_llm_completion(llm=llm, prompt_text=prompt, output_class=output_class)
+    # gather() schedules the coroutines and preserves the input order.
+    return await asyncio.gather(*(_bounded_completion(prompt) for prompt in prompts))