Skip to content
Snippets Groups Projects
Commit f3c9c259 authored by Chris Zubak-Skees's avatar Chris Zubak-Skees
Browse files

Merge branch 'add-bsn-summary-script' into 'main'

Add BSN summaries script

Closes ots/clients/lfc/llm-tracker#2

See merge request !193
parents d3dff30a 140f4597
No related branches found
No related tags found
1 merge request!193Add BSN summaries script
#!/usr/bin/env python3
"""
Description:
This script generates LLM summaries for BSN proposals.
Prerequisites:
1. Optionally, set up a `config.py` file to provide configuration
- copy `config.py.tmpl` to `config.py` and fill in the values
2. Install dependencies:
- `requests`
Installation:
$ python -m venv venv
$ source ./venv/bin/activate
$ pip3 install requests
Usage:
$ ./bsn_summaries.py [input CSV file] [output CSV file]
Example:
$ ./bsn_summaries.py bsn_proposals.csv bsn_summaries.csv
"""
try:
import config
except ImportError:
config = object()
from dataclasses import asdict, dataclass
import os
import requests
from requests.adapters import HTTPAdapter, Retry
import argparse
import logging
import csv
# Emit INFO-level records as bare messages (no level/timestamp prefix), since
# the log output doubles as the script's console output.
logging.basicConfig(level=logging.INFO, format="%(message)s")
# Raise the threshold of the HTTP libraries' loggers so that only warnings and
# errors from them appear alongside the script's own messages.
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
# Command-line interface definition.
#
# The two LLM options resolve their defaults in this order: the environment
# variable if set, otherwise the attribute of the optional `config` module,
# otherwise None. A value passed on the command line overrides the default.
parser = argparse.ArgumentParser(
    prog="bsn_summaries.py",
    description="Generate BSN summaries for proposals",
)
parser.add_argument(
    "input",
    type=str,
    help="The input CSV file containing the proposals",
)
parser.add_argument(
    "output",
    type=str,
    help="The output CSV file to write the summaries to",
)
parser.add_argument(
    "--llm-endpoint",
    default=os.getenv("LLM_ENDPOINT", getattr(config, "LLM_ENDPOINT", None)),
    # Adjacent string literals are concatenated verbatim, so the separating
    # space has to be part of the first literal.
    help=(
        "URL of the LLM API used to generate the summaries; "
        "can alternatively be set in config.py or as an environment variable"
    ),
    type=str,
)
parser.add_argument(
    "--llm-api-key",
    default=os.getenv("LLM_API_KEY", getattr(config, "LLM_API_KEY", None)),
    help=(
        "Key for the LLM API used to generate the summaries; "
        "can alternatively be set in config.py or as an environment variable"
    ),
    type=str,
)
class APIClient:
    """
    Minimal JSON-over-HTTP client built on a shared `requests` session.

    The session is mounted with an adapter that retries up to three times,
    with backoff, on the listed HTTP status codes.
    """

    endpoint: str
    api_key: str

    def __init__(self, endpoint, api_key):
        """
        Store the base URL and API key and prepare the retrying session.
        """
        self.endpoint = endpoint
        self.api_key = api_key
        self.session = requests.Session()
        # NOTE(review): urllib3's Retry only retries the HTTP methods in its
        # allowed-methods set, which by default does not appear to include
        # POST -- confirm whether POST requests are meant to be retried on
        # these status codes.
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

    def make_request(self, method, path, **kwargs):
        """
        Send a request to `path` under the endpoint and return the JSON body.

        `method` is the name of a session method (e.g. "post"); any extra
        keyword arguments are forwarded to it unchanged. Raises the
        `requests` HTTP error when the response status indicates failure.
        """
        http_method = getattr(self.session, method)
        if path:
            # Join with exactly one slash so the endpoint works whether or
            # not it was configured with a trailing "/".
            url = f"{self.endpoint.rstrip('/')}/{path.lstrip('/')}"
        else:
            url = self.endpoint
        response = http_method(url, **kwargs)
        response.raise_for_status()
        return response.json()
@dataclass
class AnalysisRequest:
    """
    Request payload for the analysis call; it is converted to a dict with
    `asdict` and sent as the JSON body of the request.
    """

    # The proposal text to be summarized.
    text: str
    # The prompt passed along with the text.
    considerations: str
@dataclass
class AnalysisResponse:
    """
    Result of an analysis call: the identifier and the generated text.
    """

    id: str
    value: str

    def without_intro(self, sep=":"):
        """
        We want to remove the first line of the text if it's something like
        "Here is a summary of the proposal:", so we check if the first line
        ends with a colon and remove it if it does. Otherwise, we return the
        text as is.

        Returns a new AnalysisResponse with the same id; an empty or
        whitespace-only value is returned unchanged instead of raising.
        """
        text = self.value
        lines = text.strip().splitlines()
        # `lines` is empty for an empty/blank value, so guard the index.
        # Trailing whitespace after the separator must not hide the intro.
        if lines and lines[0].rstrip().endswith(sep):
            text = "\n".join(lines[1:]).strip()
        return AnalysisResponse(value=text, id=self.id)
class LLM(APIClient):
    """
    API client specialised for the LLM service: every request is an
    authenticated JSON POST carrying a dataclass payload.
    """

    def make_request(self, path, data, **kwargs):
        """
        POST the dataclass `data` (as JSON) to `path` and return the decoded
        JSON response. Extra keyword arguments are forwarded to the parent.
        """
        payload = asdict(data)
        request_headers = {
            "accept": "application/json",
            "Authorization": "Bearer " + self.api_key,
        }
        query = {"mock": False}
        return super().make_request(
            "post",
            path,
            json=payload,
            headers=request_headers,
            params=query,
            **kwargs,
        )

    def get_analysis(self, text, prompt):
        """
        Request a draft analysis of `text` using `prompt` as the
        considerations, and wrap the reply in an AnalysisResponse.
        """
        request = AnalysisRequest(text=text, considerations=prompt)
        reply = self.make_request("draft-analysis", request)
        return AnalysisResponse(value=reply["draft-analysis"], id=reply["id"])
# Prompt templates keyed by application type. Each prompt shows the model one
# example input (fenced) followed by the desired example summary (fenced).
# "Default" is the fallback for unrecognised application types and "Multiple"
# is used when an organization has more than one proposal.
PROMPTS = {
"Default": """
Please use available data to generate three-sentence paragraphs in past
tense about how an organization's proposal made it enter the Bold Solutions
Network.
For instance, for a proposal that looks like:
```
Lead Organization: Texas Water Trade
Press Release Text: Texas Water Trade's project Clean Water for All Texans
proposes a nonprofit water service provider that would deliver clean water
to households most in need by deploying affordable onsite water treatment
systems through a trained workforce. Key Partners include the University
of Texas at El Paso, Columbia University, and Antelope Water Management.
Project Achievement Level: Finalist
Competition: Lone Star Prize
Challenge Completion Year: 2021
```
Please generate a summary that looks like:
```
Texas Water Trade joined the Network in 2021 as a finalist in the Lone Star
Prize. It proposed the Clean Water for All Texans project—a nonprofit water
service provider that would deliver clean water to households most in need
by deploying affordable onsite water treatment systems through a trained
workforce, working with the University of Texas at El Paso, Columbia
University, and Antelope Water Management.
```
""",
"General Operations": """
Please use available data to generate three-sentence paragraphs in past
tense about how the organization entered the Bold Solutions Network.
For instance, for a proposal that looks like:
```
Lead Organization: La Casa de Amistad Inc.
Project Description: We empower the Latino/Hispanic community within
Michiana by providing educational, cultural and advocacy services in a
welcoming, bilingual environment.
Project Achievement Level: Awardee
Competition: Yield Giving Open Call
Challenge Completion Year: 2023
```
Please generate a summary that looks like:
```
La Casa de Amistad joined the Network as an awardee of the Yield Giving
Open Call in 2023 for its work in empowering the Latino/Hispanic community
within Michiana by providing educational, cultural, and advocacy services
in a welcoming, bilingual environment.
```
""",
"Multiple": """
Please use available data to generate five-sentence paragraphs in past
tense about how an organization's proposals made it enter the Bold
Solutions Network, starting with the most recent proposal, discussed
in the first sentence, and mentioning the other proposals in the
following sentences.
For instance, for a proposal that looks like:
```
Lead Organization: MiracleFeet
Project Description: We treat 80,000 children with clubfoot in 40
countries, unlocking a lifetime of play, education, and
inclusion—opportunities that are otherwise a distant dream.
Project Achievement Level: Semi-Finalist
Competition: Build a World of Play Challenge
Challenge Completion Year: 2022
Lead Organization: MiracleFeet
Project Description: MiracleFeet, our global coalition to end clubfoot
disability, will ensure that every child born with this common condition
has access to life-changing treatment.
Project Achievement Level: Top 100
Competition: MacArthur 100&Change
Challenge Completion Year: 2021
Lead Organization: MiracleFeet
Project Achievement Level: Finalist
Competition: Lone Star Prize
Challenge Completion Year: 2021
```
Please generate a summary that looks like:
```
MiracleFeet was recently a semi-finalist of the Build a World of Play
Challenge in 2022 for its proposal to treat 80,000 children with clubfoot
in 40 countries, unlocking a lifetime of play, education, and
inclusion—opportunities that are otherwise a distant dream. It first
joined the Network in 2021 as one of the Top 100 projects for its proposal
to end clubfoot disability, ensuring that every child born with this
common condition has access to life-changing treatment. MiracleFeet has
been a finalist in the Lone Star Prize, a semi-finalist
in the Build a World of Play Challenge, and a Top 100 project
in the MacArthur 100&Change competition.
```
""",
}
def get_value(proposal, partial_name):
    """
    Get the value of a field using a partial field name.

    Returns the value of the first field (in the mapping's iteration order)
    whose name contains `partial_name`, or None when no field matches.
    """
    for field in proposal:
        # csv.DictReader stores overflow columns under the key None; a
        # substring test against a non-string key would raise TypeError.
        if isinstance(field, str) and partial_name in field:
            return proposal[field]
    return None
def generate_bsn_summary(llm, proposals, **kwargs):
    """
    Generate one summary covering all of an organization's proposals.

    Each proposal contributes a block of "Field: value" lines followed by a
    blank line; the blocks are concatenated into a single text that is sent
    to the LLM. When there is more than one proposal the "Multiple" prompt is
    used; otherwise the prompt is looked up by the proposal's application
    type, falling back to "Default".

    `llm` must provide `get_analysis(text, prompt)`; `proposals` is a
    non-empty sized collection of field-name -> value mappings. Extra keyword
    arguments are accepted and ignored. Returns what `llm.get_analysis`
    returns. Raises ValueError when `proposals` is empty.
    """
    if not proposals:
        raise ValueError("No proposals to summarize")

    sections = []
    for proposal in proposals:
        fields = [
            "Lead Organization",
            "Project Achievement Level",
            "Competition",
            "Challenge Completion Year",
            "Press Release Text",
        ]
        # Fall back to the longer descriptions only when there is no press
        # release text to describe the project.
        if not get_value(proposal, "Press Release Text"):
            fields.extend(["Project Description", "Executive Summary"])
        lines = []
        for field in fields:
            value = get_value(proposal, field)
            if value and value.strip():
                lines.append(f"{field}: {value}\n")
        # Accumulate every proposal's block (separated by a blank line)
        # rather than restarting the text for each proposal, so the
        # "Multiple" prompt actually sees all of them.
        sections.append("".join(lines) + "\n")
    text = "".join(sections)

    # `proposal` is the last proposal here; it only decides the prompt when
    # it is the sole one.
    prompt = PROMPTS.get(get_value(proposal, "Application Type"), PROMPTS["Default"])
    if len(proposals) > 1:
        prompt = PROMPTS["Multiple"]

    logging.info("")
    logging.info(text)
    summary = llm.get_analysis(text, prompt)
    logging.info("")
    logging.info(f"Summary: {summary.without_intro().value}")
    logging.info("")
    return summary
def cli():
    """
    Command-line entry point.

    Reads the input CSV, groups its rows by the "10. Account ID" column,
    asks the LLM for one summary per group, and writes every row of the
    group to the output CSV with that summary in an added "Summary" column.
    A group whose summary fails is logged and left out of the output.
    """
    args = parser.parse_args()
    if not (args.llm_endpoint and args.llm_api_key):
        parser.error("LLM credentials not set")

    llm = LLM(endpoint=args.llm_endpoint, api_key=args.llm_api_key)

    for banner_line in ("", "Generating BSN summaries", ""):
        logging.info(banner_line)

    rows_by_account = {}
    with (
        open(args.input, newline="") as infile,
        open(args.output, "w", newline="") as outfile,
    ):
        reader = csv.DictReader(infile)
        writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames + ["Summary"])
        writer.writeheader()

        # First pass: bucket rows per account, keeping first-seen order.
        for row in reader:
            rows_by_account.setdefault(row["10. Account ID"], []).append(row)

        # Second pass: one LLM call per account, one output row per proposal.
        for account_rows in rows_by_account.values():
            try:
                summary = generate_bsn_summary(llm, account_rows, **vars(args))
                for row in account_rows:
                    writer.writerow(
                        {**row, "Summary": summary.without_intro().value}
                    )
            except Exception as e:
                # Best effort: report the failure and move on to the next
                # account.
                logging.error(f"Error generating summary: {e}")

    logging.info("")
if __name__ == "__main__":
cli()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment