#!/usr/bin/env python3

# Provenance: reconstructed from the GitLab patch "Add BSN summaries script"
# (commit 140f4597c9cde00a411cb95149183b26dc8b7715, authored by
# Chris Zubak-Skees <chriszs@gmail.com> on Fri, 14 Feb 2025 19:24:38 +0400),
# which creates scripts/llm/bsn_summaries.py and closes
# https://code.librehq.com/ots/clients/lfc/llm-tracker/-/issues/2

"""
Description:
    This script generates LLM summaries for BSN proposals.

Prerequisites:
    1. Optionally, set up a `config.py` file to provide configuration
        - copy `config.py.tmpl` to `config.py` and fill in the values
    2. Install dependencies:
        - `requests`

Installation:
    $ python -m venv venv
    $ source ./venv/bin/activate
    $ pip3 install requests

Usage:
    $ ./bsn_summaries.py [input CSV file] [output CSV file]

Example:
    $ ./bsn_summaries.py bsn_proposals.csv bsn_summaries.csv
"""

import argparse
import csv
import logging
import os
from collections import defaultdict
from dataclasses import asdict, dataclass

try:
    import requests
    from requests.adapters import HTTPAdapter, Retry
except ImportError:
    # Mirrors the optional `config` import below: the pure helpers stay
    # importable without the HTTP stack, and APIClient reports the missing
    # dependency with an actionable message when it is constructed.
    requests = None

try:
    import config
except ImportError:
    # A bare object has no attributes, so every getattr(config, ..., None)
    # below falls back to its default.
    config = object()

logging.basicConfig(level=logging.INFO, format="%(message)s")
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

# Column that identifies the organization a proposal row belongs to.
ACCOUNT_ID_FIELD = "10. Account ID"
# Column added to the output CSV.
SUMMARY_FIELD = "Summary"
# Fields always offered to the LLM, in the order they are written.
PROPOSAL_FIELDS = (
    "Lead Organization",
    "Project Achievement Level",
    "Competition",
    "Challenge Completion Year",
    "Press Release Text",
)
# Fields used only when a proposal has no press release text.
FALLBACK_FIELDS = ("Project Description", "Executive Summary")


parser = argparse.ArgumentParser(
    prog="bsn_summaries.py",
    description="Generate BSN summaries for proposals",
)

parser.add_argument(
    "input",
    type=str,
    help="The input CSV file containing the proposals",
)
parser.add_argument(
    "output",
    type=str,
    help="The output CSV file to write the summaries to",
)


parser.add_argument(
    "--llm-endpoint",
    default=os.getenv("LLM_ENDPOINT", getattr(config, "LLM_ENDPOINT", None)),
    help=(
        "URL of the LLM API used to generate the summaries; "
        "can alternatively be set in config.py or as an environment variable"
    ),
    type=str,
)
parser.add_argument(
    "--llm-api-key",
    default=os.getenv("LLM_API_KEY", getattr(config, "LLM_API_KEY", None)),
    help=(
        "Key for the LLM API used to generate the summaries; "
        "can alternatively be set in config.py or as an environment variable"
    ),
    type=str,
)


class APIClient:
    """Minimal JSON-over-HTTP client built on a `requests` session."""

    endpoint: str
    api_key: str

    def __init__(self, endpoint, api_key):
        """
        Store the endpoint and key and set up a session with a retry policy.

        Raises ImportError when the `requests` package is not installed.
        """
        if requests is None:
            raise ImportError(
                "The 'requests' package is required: pip3 install requests"
            )

        self.endpoint = endpoint
        self.api_key = api_key
        self.session = requests.Session()

        # NOTE(review): urllib3's Retry only retries methods listed in
        # `allowed_methods`, which by default excludes POST. The status
        # retries below therefore do not apply to the POST requests issued
        # by the LLM subclass. Opt in with `allowed_methods` only if
        # re-sending a draft-analysis request is known to be safe.
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )

        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

    def _build_url(self, path):
        """Join the endpoint and path with exactly one slash between them."""
        return f"{self.endpoint.rstrip('/')}/{path.lstrip('/')}"

    def make_request(self, method, path, **kwargs):
        """
        Send a request and return the decoded JSON body.

        `method` is the name of a session method (e.g. "post"), `path` is
        appended to the endpoint, and `kwargs` are forwarded to the session
        call. Raises the session's HTTP error for 4xx/5xx responses.
        """
        http_method = getattr(self.session, method)
        response = http_method(self._build_url(path), **kwargs)

        response.raise_for_status()

        return response.json()


@dataclass
class AnalysisRequest:
    """Payload sent to the LLM: the text to analyze and the prompt."""

    text: str
    considerations: str


@dataclass
class AnalysisResponse:
    """Result returned by the LLM: its identifier and the generated text."""

    id: str
    value: str

    def without_intro(self, sep=":"):
        """
        We want to remove the first line of the text if it's something like
        "Here is a summary of the proposal:", so we check if the first line
        ends with a colon and remove it if it does. Otherwise, we return the
        text as is.

        Returns a new AnalysisResponse with the same id; an empty or
        whitespace-only value is returned unchanged.
        """
        text = self.value

        lines = text.strip().splitlines()
        # `lines` is empty for blank text; trailing whitespace after the
        # separator must not hide an intro line.
        if lines and lines[0].rstrip().endswith(sep):
            text = "\n".join(lines[1:]).strip()

        return AnalysisResponse(value=text, id=self.id)


class LLM(APIClient):
    """Client for the LLM's draft-analysis API."""

    def make_request(self, path, data, **kwargs):
        """POST the dataclass `data` as JSON to `path` with bearer auth."""
        return super().make_request(
            "post",
            path,
            json=asdict(data),
            headers={
                "accept": "application/json",
                "Authorization": f"Bearer {self.api_key}",
            },
            params={"mock": False},
            **kwargs,
        )

    def get_analysis(self, text, prompt):
        """Request an analysis of `text` guided by `prompt`."""
        response = self.make_request(
            "draft-analysis",
            AnalysisRequest(text=text, considerations=prompt),
        )
        return AnalysisResponse(value=response["draft-analysis"], id=response["id"])


# Prompts keyed by the proposal's "Application Type"; "Multiple" is used when
# an organization has more than one proposal.
PROMPTS = {
    "Default": """
    Please use available data to generate three-sentence paragraphs in past
    tense about how an organization's proposal made it enter the Bold Solutions
    Network.

    For instance, for a proposal that looks like:
    ```
    Lead Organization: Texas Water Trade
    Press Release Text: Texas Water Trade's project Clean Water for All Texans
    proposes a nonprofit water service provider that would deliver clean water
    to households most in need by deploying affordable onsite water treatment
    systems through a trained workforce. Key Partners include the University
    of Texas at El Paso, Columbia University, and Antelope Water Management.
    Project Achievement Level: Finalist
    Competition: Lone Star Prize
    Challenge Completion Year: 2021
    ```

    Please generate a summary that looks like:
    ```
    Texas Water Trade joined the Network in 2021 as a finalist in the Lone Star
    Prize. It proposed the Clean Water for All Texans project—a nonprofit water
    service provider that would deliver clean water to households most in need
    by deploying affordable onsite water treatment systems through a trained
    workforce, working with the University of Texas at El Paso, Columbia
    University, and Antelope Water Management.
    ```
    """,
    "General Operations": """
    Please use available data to generate three-sentence paragraphs in past
    tense about how the organization entered the Bold Solutions Network.

    For instance, for a proposal that looks like:
    ```
    Lead Organization: La Casa de Amistad Inc.
    Project Description: We empower the Latino/Hispanic community within
    Michiana by providing educational, cultural and advocacy services in a
    welcoming, bilingual environment.
    Project Achievement Level: Awardee
    Competition: Yield Giving Open Call
    Challenge Completion Year: 2023
    ```

    Please generate a summary that looks like:
    ```
    La Casa de Amistad joined the Network as an awardee of the Yield Giving
    Open Call in 2023 for its work in empowering the Latino/Hispanic community
    within Michiana by providing educational, cultural, and advocacy services
    in a welcoming, bilingual environment.
    ```
    """,
    "Multiple": """
    Please use available data to generate five-sentence paragraphs in past
    tense about how an organization's proposals made it enter the Bold
    Solutions Network, starting with the most recent proposal, discussed
    in the first sentence, and mentioning the other proposals in the
    following sentences.

    For instance, for a proposal that looks like:
    ```
    Lead Organization: MiracleFeet
    Project Description: We treat 80,000 children with clubfoot in 40
    countries, unlocking a lifetime of play, education, and
    inclusion—opportunities that are otherwise a distant dream.
    Project Achievement Level: Semi-Finalist
    Competition: Build a World of Play Challenge
    Challenge Completion Year: 2022

    Lead Organization: MiracleFeet
    Project Description: MiracleFeet, our global coalition to end clubfoot
    disability, will ensure that every child born with this common condition
    has access to life-changing treatment.
    Project Achievement Level: Top 100
    Competition: MacArthur 100&Change
    Challenge Completion Year: 2021

    Lead Organization: MiracleFeet
    Project Achievement Level: Finalist
    Competition: Lone Star Prize
    Challenge Completion Year: 2021
    ```

    Please generate a summary that looks like:
    ```
    MiracleFeet was recently a semi-finalist of the Build a World of Play
    Challenge in 2022 for its proposal to treat 80,000 children with clubfoot
    in 40 countries, unlocking a lifetime of play, education, and
    inclusion—opportunities that are otherwise a distant dream. It first
    joined the Network in 2021 as one of the Top 100 projects for its proposal
    to end clubfoot disability, ensuring that every child born with this
    common condition has access to life-changing treatment. MiracleFeet has
    been a finalist in the Lone Star Prize, a semi-finalist
    in the Build a World of Play Challenge, and a Top 100 project
    in the MacArthur 100&Change competition.
    ```
    """,
}


def get_value(proposal, partial_name):
    """
    Get the value of a field using a partial field name.

    Returns the value of the first field whose name contains `partial_name`,
    or None when no field matches.
    """

    for field in proposal:
        # csv.DictReader files surplus cells under a None key; skip it
        # instead of failing on the substring test.
        if field and partial_name in field:
            return proposal[field]

    return None


def _has_text(value):
    """Tell whether a CSV cell holds a non-blank string."""
    return isinstance(value, str) and bool(value.strip())


def _format_proposal(proposal):
    """Render one proposal as "Field: value" lines ending with a blank line."""
    fields = list(PROPOSAL_FIELDS)

    # Descriptions are only a fallback for proposals lacking a press release.
    if not _has_text(get_value(proposal, "Press Release Text")):
        fields.extend(FALLBACK_FIELDS)

    lines = []
    for field in fields:
        value = get_value(proposal, field)
        if _has_text(value):
            lines.append(f"{field}: {value}\n")

    return "".join(lines) + "\n"


def generate_bsn_summary(llm, proposals, **kwargs):
    """
    Ask the LLM for one summary covering all of an organization's proposals.

    `llm` must provide `get_analysis(text, prompt)`; `proposals` is a
    non-empty list of CSV row dicts for a single organization. Extra keyword
    arguments are accepted for caller compatibility and ignored.

    Returns the AnalysisResponse as received (intro line not removed).
    Raises ValueError when `proposals` is empty.
    """
    if not proposals:
        raise ValueError("At least one proposal is required")

    # Every proposal contributes to the text: the "Multiple" prompt expects
    # to see all of them, not just the last one.
    text = "".join(_format_proposal(proposal) for proposal in proposals)

    if len(proposals) > 1:
        prompt = PROMPTS["Multiple"]
    else:
        application_type = get_value(proposals[0], "Application Type")
        prompt = PROMPTS.get(application_type, PROMPTS["Default"])

    logging.info("")
    logging.info(text)

    summary = llm.get_analysis(text, prompt)

    logging.info("")
    logging.info("Summary: %s", summary.without_intro().value)
    logging.info("")

    return summary


def _read_proposals(path):
    """
    Read the input CSV and group its rows by account ID.

    Returns `(fieldnames, groups)` where `groups` maps each account ID to its
    rows in file order. Raises ValueError when the file has no header row or
    lacks the account ID column.
    """
    with open(path, newline="") as infile:
        reader = csv.DictReader(infile)

        fieldnames = reader.fieldnames
        if not fieldnames:
            raise ValueError(f"{path} is empty or has no header row")
        if ACCOUNT_ID_FIELD not in fieldnames:
            raise ValueError(f'{path} has no "{ACCOUNT_ID_FIELD}" column')

        groups = defaultdict(list)
        for row in reader:
            groups[row[ACCOUNT_ID_FIELD]].append(row)

    return list(fieldnames), dict(groups)


def cli():
    """Parse arguments, summarize each organization, and write the output."""
    args = parser.parse_args()

    if not args.llm_endpoint or not args.llm_api_key:
        parser.error("LLM credentials not set")

    try:
        llm = LLM(endpoint=args.llm_endpoint, api_key=args.llm_api_key)
    except ImportError as error:
        parser.error(str(error))

    logging.info("")
    logging.info("Generating BSN summaries")
    logging.info("")

    # Validate the input before the output file is opened (and truncated).
    try:
        fieldnames, orgs = _read_proposals(args.input)
    except ValueError as error:
        parser.error(str(error))

    # Avoid a duplicate header when the input already has a summary column.
    if SUMMARY_FIELD not in fieldnames:
        fieldnames.append(SUMMARY_FIELD)

    with open(args.output, "w", newline="") as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)

        writer.writeheader()

        for account_id, proposals in orgs.items():
            # Best effort: a failure for one organization is logged and its
            # rows are left out, so the remaining organizations still run.
            try:
                summary = generate_bsn_summary(llm, proposals, **vars(args))
                summary_text = summary.without_intro().value

                for proposal in proposals:
                    writer.writerow({**proposal, SUMMARY_FIELD: summary_text})
            except Exception as e:
                logging.error(
                    "Error generating summary for account %s: %s", account_id, e
                )
                continue

    logging.info("")


if __name__ == "__main__":
    cli()