From 1e9848051a0d317dacef800259e86c975f78372e Mon Sep 17 00:00:00 2001 From: Tyler Hill Date: Sat, 2 May 2026 00:15:09 -0500 Subject: [PATCH 1/7] Enable GCP budget runner --- runners/monthly.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runners/monthly.sh b/runners/monthly.sh index c7983af..8c2db2d 100644 --- a/runners/monthly.sh +++ b/runners/monthly.sh @@ -8,6 +8,6 @@ ./api-tools -headless -verbose -upload -map # scrape, parse, and upload budgets -#./api-tools -headless -verbose -scrape -budgets -useBackupBudgets -#./api-tools -headless -verbose -parse -budgets -useBackupBudgets -#./api-tools -headless -verbose -upload -budgets -useBackupBudgets +./api-tools -headless -verbose -scrape -budgets -useBackupBudgets +./api-tools -headless -verbose -parse -budgets -useBackupBudgets +./api-tools -headless -verbose -upload -budgets -useBackupBudgets From 634c0e31123c7c42673f883c63a27bd00ae0fd84 Mon Sep 17 00:00:00 2001 From: Tyler Hill Date: Sun, 10 May 2026 17:21:02 -0500 Subject: [PATCH 2/7] Add cache invalidation warning --- parser/academicCalendarsParser.go | 1 + parser/budgetsParser.go | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/parser/academicCalendarsParser.go b/parser/academicCalendarsParser.go index 2ffb03e..8510fb6 100644 --- a/parser/academicCalendarsParser.go +++ b/parser/academicCalendarsParser.go @@ -27,6 +27,7 @@ import ( ) // What gets sent to Gemini, with the PDF content added +// WARNING: Changes to this prompt will invalidate all cached AI responses, only change if necessary var academicCalendarPrompt = `Parse this PDF content and generate the following JSON schema. 
{ diff --git a/parser/budgetsParser.go b/parser/budgetsParser.go index 708e20e..c762da3 100644 --- a/parser/budgetsParser.go +++ b/parser/budgetsParser.go @@ -27,6 +27,7 @@ import ( ) // What gets sent to Gemini, with the PDF content added +// WARNING: Changes to this prompt will invalidate all cached AI responses, only change if necessary var budgetPrompt = `Parse the content of these PDFs and generate the following JSON schema. { @@ -242,7 +243,7 @@ var budgetPrompt = `Parse the content of these PDFs and generate the following J } } -- Use the full UTD school names in this title text: School of Arts, Humanities, and Technology; School of Behavioral and Brain Sciences; School of Economic, Political and Policy Sciences; School of Engineering and Computer Science; School of Interdisciplinary Studies; School of Management; School of Natural Sciences and Mathematics. +- Use the full UTD school names in this title-case text: School of Arts, Humanities, and Technology; School of Behavioral and Brain Sciences; School of Economic, Political and Policy Sciences; School of Engineering and Computer Science; School of Interdisciplinary Studies; School of Management; School of Natural Sciences and Mathematics. - In older years: School of Arts, Technology, and Emerging Communication; School of Arts & Humanities. - Replace Brian with Brain in the School of Behavioral and Brain Sciences name if it is misspelled in the PDF. - Always use the data listed for %s, not any previous years. 
From 74f6593aec7553f525b115b4e11f7baccf383ed1 Mon Sep 17 00:00:00 2001 From: Tyler Hill Date: Sun, 24 May 2026 22:07:55 -0500 Subject: [PATCH 3/7] Pull schema change without notes field --- go.mod | 2 +- go.sum | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index fe3a6ea..29a1aca 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.26 require ( github.com/PuerkitoBio/goquery v1.12.0 - github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe + github.com/UTDNebula/nebula-api/api v0.0.0-20260525024309-4ea6ee54dd91 github.com/chromedp/cdproto v0.0.0-20260321001828-e3e3800016bc github.com/chromedp/chromedp v0.15.1 github.com/dongri/phonenumber v0.1.12 diff --git a/go.sum b/go.sum index f9d36bf..f80140d 100644 --- a/go.sum +++ b/go.sum @@ -64,6 +64,8 @@ github.com/UTDNebula/nebula-api/api v0.0.0-20260327185527-807066607244 h1:vp2hsJ github.com/UTDNebula/nebula-api/api v0.0.0-20260327185527-807066607244/go.mod h1:lp0oZHhVmqAqm0gf6Ald2jZXepZ0xFheTsW76T9wC7I= github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe h1:/y+M3Up3U7PKvWV7yyZ7ouvNd8081Zwmd4p5NFD3kk4= github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe/go.mod h1:i+PQZZ3qPtE4UxXkp3tQ46NWpzB8Of2/VNl0iZ/uv9I= +github.com/UTDNebula/nebula-api/api v0.0.0-20260525024309-4ea6ee54dd91 h1:KUwnKeedRHYncIcVYHMtXVmGUSp0LTxnbtO566GqC+c= +github.com/UTDNebula/nebula-api/api v0.0.0-20260525024309-4ea6ee54dd91/go.mod h1:i+PQZZ3qPtE4UxXkp3tQ46NWpzB8Of2/VNl0iZ/uv9I= github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= From d1d63199447ecf13c3da6842051ba0abbded91cd Mon Sep 17 00:00:00 2001 From: Tyler Hill Date: Mon, 25 May 2026 00:35:40 -0500 Subject: [PATCH 4/7] Specify more tables to make optional on earlier docs --- go.mod | 2 +- 
go.sum | 2 ++ parser/budgetsParser.go | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 29a1aca..50d3739 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.26 require ( github.com/PuerkitoBio/goquery v1.12.0 - github.com/UTDNebula/nebula-api/api v0.0.0-20260525024309-4ea6ee54dd91 + github.com/UTDNebula/nebula-api/api v0.0.0-20260525053158-3209b0868dcf github.com/chromedp/cdproto v0.0.0-20260321001828-e3e3800016bc github.com/chromedp/chromedp v0.15.1 github.com/dongri/phonenumber v0.1.12 diff --git a/go.sum b/go.sum index f80140d..2737f22 100644 --- a/go.sum +++ b/go.sum @@ -66,6 +66,8 @@ github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe h1:/y+M3U github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe/go.mod h1:i+PQZZ3qPtE4UxXkp3tQ46NWpzB8Of2/VNl0iZ/uv9I= github.com/UTDNebula/nebula-api/api v0.0.0-20260525024309-4ea6ee54dd91 h1:KUwnKeedRHYncIcVYHMtXVmGUSp0LTxnbtO566GqC+c= github.com/UTDNebula/nebula-api/api v0.0.0-20260525024309-4ea6ee54dd91/go.mod h1:i+PQZZ3qPtE4UxXkp3tQ46NWpzB8Of2/VNl0iZ/uv9I= +github.com/UTDNebula/nebula-api/api v0.0.0-20260525053158-3209b0868dcf h1:5IIliVrXFa8zyLrWkvK6Z5gtXVw8nrdsXWJkZqRONAU= +github.com/UTDNebula/nebula-api/api v0.0.0-20260525053158-3209b0868dcf/go.mod h1:i+PQZZ3qPtE4UxXkp3tQ46NWpzB8Of2/VNl0iZ/uv9I= github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= diff --git a/parser/budgetsParser.go b/parser/budgetsParser.go index c762da3..4ef9522 100644 --- a/parser/budgetsParser.go +++ b/parser/budgetsParser.go @@ -250,6 +250,8 @@ var budgetPrompt = `Parse the content of these PDFs and generate the following J - Do not infer, estimate, or guess any values. - If a value is missing or unclear, return null for that field. 
- Only values surrounded by parentheses in the tables should be considered negative. +- In FY 2023 and earlier, Service Departments Funds, Designated Funds, Auxiliary Expenses, and Restricted Funds are not grouped by school and are too long to parse. Thus these tables should be omitted, only for these years. +- In FY 2019 and earleir, some of the PDFs have been scanned in and thus many pages may be missing in the text extraction. If much or all but the preamble of a PDF is missing, exclude it from the output. Content of PDFs: From cbf58331de034fc57cbfcabbeca35ff0405f1d73 Mon Sep 17 00:00:00 2001 From: Tyler Hill Date: Mon, 25 May 2026 00:35:54 -0500 Subject: [PATCH 5/7] Remove token printing --- parser/budgetsParser.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/parser/budgetsParser.go b/parser/budgetsParser.go index 4ef9522..40c1c05 100644 --- a/parser/budgetsParser.go +++ b/parser/budgetsParser.go @@ -447,10 +447,6 @@ func parseBudgetPdfs(paths []string) (schema.Budget, error) { // Get response result = response.Candidates[0].Content.Parts[0].Text - log.Print("Token counts:") - log.Printf("Prompt: %d", response.UsageMetadata.PromptTokenCount) - log.Printf("Thoughts: %d", response.UsageMetadata.ThoughtsTokenCount) - log.Printf("Total: %d", response.UsageMetadata.TotalTokenCount) // Set cache for next time err = utils.SetCache(hash, result, apiBucket) From 92b34b492dfabbbec58a05d950f0ecb16530cebc Mon Sep 17 00:00:00 2001 From: Tyler Hill Date: Mon, 25 May 2026 00:48:44 -0500 Subject: [PATCH 6/7] If error other than directory not existing, and we're not using backup budgets, panic --- parser/budgetsParser.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parser/budgetsParser.go b/parser/budgetsParser.go index 40c1c05..097c259 100644 --- a/parser/budgetsParser.go +++ b/parser/budgetsParser.go @@ -12,9 +12,11 @@ import ( "crypto/sha256" "encoding/hex" "encoding/json" + "errors" "fmt" "io/fs" "log" + "os" "path/filepath" "reflect" 
"strings" @@ -337,7 +339,8 @@ func ParseBudgets(inDir string, outDir string, budgetsDir string, useBackupBudge } return nil }) - if err != nil { + // Panic on any error, unless the directory doesn't exist and we're using backup budgets + if err != nil && !(errors.Is(err, os.ErrNotExist) && useBackupBudgets) { panic(err) } From 331d38da11a5cd401e71c9d7a41f01172d4f41e4 Mon Sep 17 00:00:00 2001 From: Tyler Hill Date: Mon, 25 May 2026 17:28:33 -0500 Subject: [PATCH 7/7] Fix typo (renamed all hashes in bucket) --- parser/budgetsParser.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parser/budgetsParser.go b/parser/budgetsParser.go index 097c259..025a42f 100644 --- a/parser/budgetsParser.go +++ b/parser/budgetsParser.go @@ -253,7 +253,7 @@ var budgetPrompt = `Parse the content of these PDFs and generate the following J - If a value is missing or unclear, return null for that field. - Only values surrounded by parentheses in the tables should be considered negative. - In FY 2023 and earlier, Service Departments Funds, Designated Funds, Auxiliary Expenses, and Restricted Funds are not grouped by school and are too long to parse. Thus these tables should be omitted, only for these years. -- In FY 2019 and earleir, some of the PDFs have been scanned in and thus many pages may be missing in the text extraction. If much or all but the preamble of a PDF is missing, exclude it from the output. +- In FY 2019 and earlier, some of the PDFs have been scanned in and thus many pages may be missing in the text extraction. If much or all but the preamble of a PDF is missing, exclude it from the output. Content of PDFs: