Merge branch 'add-bsn-summary-script' into 'main'

Add BSN summaries script. Closes ots/clients/lfc/llm-tracker#2. See merge request !193.

Merge branch 'add-bsn-summary-script' into 'main'
f3c9c259 · Chris Zubak-Skees · d3dff30a · 140f4597 · f3c9c259
Commit f3c9c259 authored 1 week ago by Chris Zubak-Skees
--- a/scripts/llm/bsn_summaries.py
+++ b/scripts/llm/bsn_summaries.py
+#!/usr/bin/env python3
+
+"""
+Description:
+  This script generates LLM summaries for BSN proposals.
+
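+Prerequisites:
+  1. Provide the LLM endpoint and API key in one of three ways:
+     - pass `--llm-endpoint` and `--llm-api-key` on the command line
+     - set the `LLM_ENDPOINT` and `LLM_API_KEY` environment variables
+     - optionally, set up a `config.py` file to provide configuration:
+       copy `config.py.tmpl` to `config.py` and fill in the values
+  2. Install dependencies:
+     - `requests`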
+
+Installation:
+  $ python -m venv venv
+  $ source ./venv/bin/activate
+  $ pip3 install requests
+
+Usage:
+  $ ./bsn_summaries.py [input CSV file] [output CSV file]
+
+Example:
+  $ ./bsn_summaries.py bsn_proposals.csv bsn_summaries.csv
+"""
+
+try:
+    import config
+except ImportError:
+    config = object()
+
+from dataclasses import asdict, dataclass
+import os
+import requests
+from requests.adapters import HTTPAdapter, Retry
+import argparse
+import logging
+import csv
+
+# Print bare messages at INFO level, and keep the HTTP libraries' own loggers
+# quiet unless they have a warning or an error to report.
+logging.basicConfig(format="%(message)s", level=logging.INFO)
+for _noisy_logger in ("requests", "urllib3"):
+    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
+
+
+# Command-line interface definition. The LLM settings default to the
+# environment variable of the same name, falling back to `config.py`.
+parser = argparse.ArgumentParser(
+    prog="bsn_summaries.py",
+    description="Generate BSN summaries for proposals",
+)
+
+# Positional arguments: where to read proposals from and write summaries to.
+parser.add_argument(
+    "input",
+    type=str,
+    help="The input CSV file containing the proposals",
+)
+parser.add_argument(
+    "output",
+    type=str,
+    help="The output CSV file to write the summaries to",
+)
+
+
+# Optional arguments: LLM API settings. Adjacent string literals are joined
+# without a separator, so each help fragment carries its own trailing space.
+parser.add_argument(
+    "--llm-endpoint",
+    default=os.getenv("LLM_ENDPOINT", getattr(config, "LLM_ENDPOINT", None)),
+    help=(
+        "URL of the LLM API used to generate the summaries, "
+        "can alternatively be set in config.py or as an environment variable"
+    ),
+    type=str,
+)
+parser.add_argument(
+    "--llm-api-key",
+    default=os.getenv("LLM_API_KEY", getattr(config, "LLM_API_KEY", None)),
+    help=(
+        "Key for the LLM API used to generate the summaries, "
+        "can alternatively be set in config.py or as an environment variable"
+    ),
+    type=str,
+)
+
+
+class APIClient:
+    """
+    Minimal JSON-over-HTTP client built on a `requests` session.
+
+    The session retries requests that come back with a 429 or 5xx status,
+    backing off between attempts.
+    """
+
+    endpoint: str
+    api_key: str
+
+    def __init__(self, endpoint, api_key):
+        """
+        Args:
+            endpoint: base URL of the API.
+            api_key: credential stored for subclasses to send with requests.
+        """
+        self.endpoint = endpoint
+        self.api_key = api_key
+        self.session = requests.Session()
+
+        retry_strategy = Retry(
+            total=3,
+            backoff_factor=1,
+            status_forcelist=[429, 500, 502, 503, 504],
+            # By default only idempotent methods are retried on a bad status,
+            # which would leave POST requests with no status retries at all.
+            # `None` retries on any method (requires urllib3 >= 1.26).
+            allowed_methods=None,
+        )
+
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        self.session.mount("http://", adapter)
+        self.session.mount("https://", adapter)
+
+    def _build_url(self, path):
+        """
+        Join the endpoint and `path` with exactly one slash between them.
+        """
+        if not path:
+            return self.endpoint
+
+        return f"{self.endpoint.rstrip('/')}/{path.lstrip('/')}"
+
+    def make_request(self, method, path, **kwargs):
+        """
+        Send a request and return the decoded JSON body.
+
+        Args:
+            method: name of the session method to call, e.g. "get" or "post".
+            path: path appended to the endpoint.
+            **kwargs: passed through to the session method.
+
+        Raises:
+            Whatever `raise_for_status()` raises when the response carries
+            an error status.
+        """
+        http_method = getattr(self.session, method)
+        response = http_method(
+            self._build_url(path),
+            **kwargs,
+        )
+
+        response.raise_for_status()
+
+        return response.json()
+
+
+@dataclass
+class AnalysisRequest:
+    """
+    Payload for an analysis request; converted to a dict and sent as JSON.
+    """
+
+    # The text to be analyzed
+    text: str
+    # The instructions (prompt) for the analysis
+    considerations: str
+
+
+@dataclass
+class AnalysisResponse:
+    """
+    Result of an analysis request: the response identifier and its text.
+    """
+
+    id: str
+    value: str
+
+    def without_intro(self, sep=":"):
+        """
+        Return a copy of this response with any introductory line removed.
+
+        We want to remove the first line of the text if it's something like
+        "Here is a summary of the proposal:", so we check if the first line
+        ends with a colon and remove it if it does. Otherwise, we return the
+        text as is. An empty or whitespace-only value has no first line and
+        is also returned as is.
+
+        Args:
+            sep: the trailing marker that identifies an introductory line.
+        """
+        text = self.value
+
+        lines = (text or "").strip().splitlines()
+        # Ignore whitespace after the separator when checking the first line
+        if lines and lines[0].rstrip().endswith(sep):
+            text = "\n".join(lines[1:]).strip()
+
+        return AnalysisResponse(value=text, id=self.id)
+
+
+class LLM(APIClient):
+    """
+    API client for the LLM service that drafts the analyses.
+    """
+
+    def make_request(self, path, data, **kwargs):
+        """
+        POST the dataclass `data` to `path` as JSON, authenticating with the
+        API key as a bearer token, and return the decoded response.
+        """
+        payload = asdict(data)
+        request_headers = {
+            "accept": "application/json",
+            "Authorization": "Bearer " + self.api_key,
+        }
+        query = {"mock": False}
+
+        return super().make_request(
+            "post",
+            path,
+            json=payload,
+            headers=request_headers,
+            params=query,
+            **kwargs,
+        )
+
+    def get_analysis(self, text, prompt):
+        """
+        Request a draft analysis of `text`, using `prompt` as the
+        considerations, and wrap the result in an `AnalysisResponse`.
+        """
+        request = AnalysisRequest(text=text, considerations=prompt)
+        result = self.make_request("draft-analysis", request)
+
+        analysis = result["draft-analysis"]
+        analysis_id = result["id"]
+
+        return AnalysisResponse(id=analysis_id, value=analysis)
+
+
+# Prompts sent to the LLM, keyed by a proposal's application type. "Default"
+# is the fallback for unlisted types, and "Multiple" is used when an
+# organization has more than one proposal. Each prompt shows an example input
+# and an example summary, both wrapped in ``` fences.
+PROMPTS = {
+    "Default": """
+    Please use available data to generate three-sentence paragraphs in past
+    tense about how an organization's proposal made it enter the Bold Solutions
+    Network.
+
+    For instance, for a proposal that looks like:
+    ```
+    Lead Organization: Texas Water Trade
+    Press Release Text: Texas Water Trade's project Clean Water for All Texans
+    proposes a nonprofit water service provider that would deliver clean water
+    to households most in need by deploying affordable onsite water treatment
+    systems through a trained workforce. Key Partners include the University
+    of Texas at El Paso, Columbia University, and Antelope Water Management.
+    Project Achievement Level: Finalist
+    Competition: Lone Star Prize
+    Challenge Completion Year: 2021
+    ```
+
+    Please generate a summary that looks like:
+    ```
+    Texas Water Trade joined the Network in 2021 as a finalist in the Lone Star
+    Prize. It proposed the Clean Water for All Texans project—a nonprofit water
+    service provider that would deliver clean water to households most in need
+    by deploying affordable onsite water treatment systems through a trained
+    workforce, working with the University of Texas at El Paso, Columbia
+    University, and Antelope Water Management.
+    ```
+    """,
+    "General Operations": """
+    Please use available data to generate three-sentence paragraphs in past
+    tense about how the organization entered the Bold Solutions Network.
+
+    For instance, for a proposal that looks like:
+    ```
+    Lead Organization: La Casa de Amistad Inc.
+    Project Description: We empower the Latino/Hispanic community within
+    Michiana by providing educational, cultural and advocacy services in a
+    welcoming, bilingual environment.
+    Project Achievement Level: Awardee
+    Competition: Yield Giving Open Call
+    Challenge Completion Year: 2023
+    ```
+
+    Please generate a summary that looks like:
+    ```
+    La Casa de Amistad joined the Network as an awardee of the Yield Giving
+    Open Call in 2023 for its work in empowering the Latino/Hispanic community
+    within Michiana by providing educational, cultural, and advocacy services
+    in a welcoming, bilingual environment.
+    ```
+    """,
+    "Multiple": """
+    Please use available data to generate five-sentence paragraphs in past
+    tense about how an organization's proposals made it enter the Bold
+    Solutions Network, starting with the most recent proposal, discussed
+    in the first sentence, and mentioning the other proposals in the
+    following sentences.
+
+    For instance, for a proposal that looks like:
+    ```
+    Lead Organization: MiracleFeet
+    Project Description: We treat 80,000 children with clubfoot in 40
+    countries, unlocking a lifetime of play, education, and
+    inclusion—opportunities that are otherwise a distant dream.
+    Project Achievement Level: Semi-Finalist
+    Competition: Build a World of Play Challenge
+    Challenge Completion Year: 2022
+
+    Lead Organization: MiracleFeet
+    Project Description: MiracleFeet, our global coalition to end clubfoot
+    disability, will ensure that every child born with this common condition
+    has access to life-changing treatment.
+    Project Achievement Level: Top 100
+    Competition: MacArthur 100&Change
+    Challenge Completion Year: 2021
+
+    Lead Organization: MiracleFeet
+    Project Achievement Level: Finalist
+    Competition: Lone Star Prize
+    Challenge Completion Year: 2021
+    ```
+
+    Please generate a summary that looks like:
+    ```
+    MiracleFeet was recently a semi-finalist of the Build a World of Play
+    Challenge in 2022 for its proposal to treat 80,000 children with clubfoot
+    in 40 countries, unlocking a lifetime of play, education, and
+    inclusion—opportunities that are otherwise a distant dream. It first
+    joined the Network in 2021 as one of the Top 100 projects for its proposal
+    to end clubfoot disability, ensuring that every child born with this
+    common condition has access to life-changing treatment. MiracleFeet has
+    been a finalist in the Lone Star Prize, a semi-finalist
+    in the Build a World of Play Challenge, and a Top 100 project
+    in the MacArthur 100&Change competition.
+    ```
+    """,
+}
+
+
+def get_value(proposal, partial_name):
+    """
+    Look up a field by a fragment of its name.
+
+    Returns the value of the first field whose name contains `partial_name`,
+    or `None` when no field name matches.
+    """
+    matching_values = (
+        proposal[name] for name in proposal if partial_name in name
+    )
+
+    return next(matching_values, None)
+
+
+def generate_bsn_summary(llm, proposals, **kwargs):
+    """
+    Generate one summary covering all of an organization's proposals.
+
+    Args:
+        llm: client whose `get_analysis(text, prompt)` produces the summary.
+        proposals: non-empty list of proposal rows (dicts keyed by column
+            name) belonging to a single organization.
+        **kwargs: accepted so callers can pass extra options; unused here.
+
+    Returns:
+        The response from `llm.get_analysis`, with any intro still in place.
+
+    Raises:
+        ValueError: if `proposals` is empty.
+    """
+    if not proposals:
+        raise ValueError("No proposals to summarize")
+
+    sections = []
+
+    for proposal in proposals:
+        fields = [
+            "Lead Organization",
+            "Project Achievement Level",
+            "Competition",
+            "Challenge Completion Year",
+            "Press Release Text",
+        ]
+
+        # Without a press release, describe the project with other fields
+        if not get_value(proposal, "Press Release Text"):
+            fields.extend(["Project Description", "Executive Summary"])
+
+        lines = []
+        for field in fields:
+            value = get_value(proposal, field)
+            if value and value.strip():
+                lines.append(f"{field}: {value}\n")
+
+        # A blank line separates one proposal from the next
+        sections.append("".join(lines) + "\n")
+
+    # Send every proposal's section: the "Multiple" prompt expects all of
+    # the organization's proposals, not just the last one.
+    text = "".join(sections)
+
+    if len(proposals) > 1:
+        prompt = PROMPTS["Multiple"]
+    else:
+        prompt = PROMPTS.get(
+            get_value(proposals[0], "Application Type"), PROMPTS["Default"]
+        )
+
+    logging.info("")
+    logging.info(text)
+
+    summary = llm.get_analysis(text, prompt)
+
+    logging.info("")
+    logging.info(f"Summary: {summary.without_intro().value}")
+    logging.info("")
+
+    return summary
+
+
+def cli():
+    """
+    Command-line entry point.
+
+    Reads the proposals from the input CSV, groups them by account ID,
+    generates one summary per account, and writes every proposal row to the
+    output CSV with an added "Summary" column. Accounts whose summary fails
+    are logged and left out of the output.
+    """
+    args = parser.parse_args()
+
+    if not args.llm_endpoint or not args.llm_api_key:
+        parser.error("LLM credentials not set")
+
+    llm = LLM(endpoint=args.llm_endpoint, api_key=args.llm_api_key)
+
+    logging.info("")
+    logging.info("Generating BSN summaries")
+    logging.info("")
+
+    account_id_field = "10. Account ID"
+    orgs = {}
+
+    # Read and validate the whole input before touching the output file, so
+    # that a bad input does not leave a truncated output behind.
+    with open(args.input, newline="") as infile:
+        reader = csv.DictReader(infile)
+        # `fieldnames` is None when the input file is empty
+        input_fields = list(reader.fieldnames or [])
+
+        if account_id_field not in input_fields:
+            parser.error(f"Input CSV has no '{account_id_field}' column")
+
+        # Group the proposals by account, keeping the input order
+        for proposal in reader:
+            orgs.setdefault(proposal[account_id_field], []).append(proposal)
+
+    with open(args.output, "w", newline="") as outfile:
+        writer = csv.DictWriter(outfile, fieldnames=input_fields + ["Summary"])
+
+        writer.writeheader()
+
+        for org_proposals in orgs.values():
+            try:
+                summary = generate_bsn_summary(llm, org_proposals, **vars(args))
+                summary_text = summary.without_intro().value
+
+                # Every proposal of an account shares the same summary
+                for proposal in org_proposals:
+                    writer.writerow({**proposal, "Summary": summary_text})
+            except Exception as e:
+                # Best effort: report the failure and move on to the next one
+                logging.error(f"Error generating summary: {e}")
+                continue
+
+    logging.info("")
+
+
+if __name__ == "__main__":
+    cli()