diff --git a/go.mod b/go.mod index fe3a6ea..50d3739 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.26 require ( github.com/PuerkitoBio/goquery v1.12.0 - github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe + github.com/UTDNebula/nebula-api/api v0.0.0-20260525053158-3209b0868dcf github.com/chromedp/cdproto v0.0.0-20260321001828-e3e3800016bc github.com/chromedp/chromedp v0.15.1 github.com/dongri/phonenumber v0.1.12 diff --git a/go.sum b/go.sum index f9d36bf..2737f22 100644 --- a/go.sum +++ b/go.sum @@ -64,6 +64,10 @@ github.com/UTDNebula/nebula-api/api v0.0.0-20260327185527-807066607244 h1:vp2hsJ github.com/UTDNebula/nebula-api/api v0.0.0-20260327185527-807066607244/go.mod h1:lp0oZHhVmqAqm0gf6Ald2jZXepZ0xFheTsW76T9wC7I= github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe h1:/y+M3Up3U7PKvWV7yyZ7ouvNd8081Zwmd4p5NFD3kk4= github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe/go.mod h1:i+PQZZ3qPtE4UxXkp3tQ46NWpzB8Of2/VNl0iZ/uv9I= +github.com/UTDNebula/nebula-api/api v0.0.0-20260525024309-4ea6ee54dd91 h1:KUwnKeedRHYncIcVYHMtXVmGUSp0LTxnbtO566GqC+c= +github.com/UTDNebula/nebula-api/api v0.0.0-20260525024309-4ea6ee54dd91/go.mod h1:i+PQZZ3qPtE4UxXkp3tQ46NWpzB8Of2/VNl0iZ/uv9I= +github.com/UTDNebula/nebula-api/api v0.0.0-20260525053158-3209b0868dcf h1:5IIliVrXFa8zyLrWkvK6Z5gtXVw8nrdsXWJkZqRONAU= +github.com/UTDNebula/nebula-api/api v0.0.0-20260525053158-3209b0868dcf/go.mod h1:i+PQZZ3qPtE4UxXkp3tQ46NWpzB8Of2/VNl0iZ/uv9I= github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= diff --git a/parser/academicCalendarsParser.go b/parser/academicCalendarsParser.go index 2ffb03e..8510fb6 100644 --- a/parser/academicCalendarsParser.go +++ b/parser/academicCalendarsParser.go @@ -27,6 +27,7 @@ import ( ) // What gets sent 
to Gemini, with the PDF content added +// WARNING: Changes to this prompt will invalidate all cached AI responses; only change it if necessary var academicCalendarPrompt = `Parse this PDF content and generate the following JSON schema. { diff --git a/parser/budgetsParser.go b/parser/budgetsParser.go index 708e20e..025a42f 100644 --- a/parser/budgetsParser.go +++ b/parser/budgetsParser.go @@ -12,9 +12,11 @@ import ( "crypto/sha256" "encoding/hex" "encoding/json" + "errors" "fmt" "io/fs" "log" + "os" "path/filepath" "reflect" "strings" @@ -27,6 +29,7 @@ import ( ) // What gets sent to Gemini, with the PDF content added +// WARNING: Changes to this prompt will invalidate all cached AI responses; only change it if necessary var budgetPrompt = `Parse the content of these PDFs and generate the following JSON schema. { @@ -242,13 +245,15 @@ var budgetPrompt = `Parse the content of these PDFs and generate the following J } } -- Use the full UTD school names in this title text: School of Arts, Humanities, and Technology; School of Behavioral and Brain Sciences; School of Economic, Political and Policy Sciences; School of Engineering and Computer Science; School of Interdisciplinary Studies; School of Management; School of Natural Sciences and Mathematics. +- Use the full UTD school names in this title-case text: School of Arts, Humanities, and Technology; School of Behavioral and Brain Sciences; School of Economic, Political and Policy Sciences; School of Engineering and Computer Science; School of Interdisciplinary Studies; School of Management; School of Natural Sciences and Mathematics. - In older years: School of Arts, Technology, and Emerging Communication; School of Arts & Humanities. - Replace Brian with Brain in the School of Behavioral and Brain Sciences name if it is misspelled in the PDF. - Always use the data listed for %s, not any previous years. - Do not infer, estimate, or guess any values. - If a value is missing or unclear, return null for that field.
- Only values surrounded by parentheses in the tables should be considered negative. +- In FY 2023 and earlier, Service Departments Funds, Designated Funds, Auxiliary Expenses, and Restricted Funds are not grouped by school and are too long to parse. Thus these tables should be omitted, only for these years. +- In FY 2019 and earlier, some of the PDFs have been scanned in and thus many pages may be missing in the text extraction. If much or all but the preamble of a PDF is missing, exclude it from the output. Content of PDFs: @@ -334,7 +339,8 @@ func ParseBudgets(inDir string, outDir string, budgetsDir string, useBackupBudge } return nil }) - if err != nil { + // Panic on any error, unless the directory doesn't exist and we're using backup budgets + if err != nil && !(errors.Is(err, os.ErrNotExist) && useBackupBudgets) { panic(err) } @@ -444,10 +450,6 @@ func parseBudgetPdfs(paths []string) (schema.Budget, error) { // Get response result = response.Candidates[0].Content.Parts[0].Text - log.Print("Token counts:") - log.Printf("Prompt: %d", response.UsageMetadata.PromptTokenCount) - log.Printf("Thoughts: %d", response.UsageMetadata.ThoughtsTokenCount) - log.Printf("Total: %d", response.UsageMetadata.TotalTokenCount) // Set cache for next time err = utils.SetCache(hash, result, apiBucket) diff --git a/runners/monthly.sh b/runners/monthly.sh index c7983af..8c2db2d 100644 --- a/runners/monthly.sh +++ b/runners/monthly.sh @@ -8,6 +8,6 @@ ./api-tools -headless -verbose -upload -map # scrape, parse, and upload budgets -#./api-tools -headless -verbose -scrape -budgets -useBackupBudgets -#./api-tools -headless -verbose -parse -budgets -useBackupBudgets -#./api-tools -headless -verbose -upload -budgets -useBackupBudgets +./api-tools -headless -verbose -scrape -budgets -useBackupBudgets +./api-tools -headless -verbose -parse -budgets -useBackupBudgets +./api-tools -headless -verbose -upload -budgets -useBackupBudgets