diff --git a/.env.template b/.env.template index d0c456d..16d3c5f 100644 --- a/.env.template +++ b/.env.template @@ -14,6 +14,8 @@ NEBULA_API_URL= NEBULA_API_STORAGE_BUCKET= NEBULA_API_KEY= NEBULA_API_STORAGE_KEY= +#Budgets +NEBULA_API_BUDGET_STORAGE_BUCKET= # Uploader MONGODB_URI= diff --git a/README.md b/README.md index 0142f69..d78eae0 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,6 @@ Project maintained by [Nebula Labs](https://about.utdnebula.com). ### Design -#### - The `grade-data` directory contains .csv files of UTD grade data. - - Files are named by year and semester, with a suffix of `S`, `U`, or `F` denoting Spring, Summer, and Fall semesters, respectively. - - This means that, for example, `22F.csv` corresponds to the 2022 Fall semester, whereas `18U.csv` corresponds with the 2018 Summer semester. - - This grade data is collected independently from the scrapers, and is used during the parsing process. #### - The `scrapers` directory contains the scrapers for various UTD data sources. This is where the data pipeline begins. - The scrapers are concerned solely with data collection, not necessarily validation or processing of said data. Those responsibilities are left to the parsing stage. #### - The `parser` directory contains the files and methods that parse the scraped data. This is the 'middle man' of the data pipeline. @@ -17,6 +13,13 @@ Project maintained by [Nebula Labs](https://about.utdnebula.com). - The input data is considered **immutable** by the parsing stage. This means the parsers should never modify the data being fed into them. #### - The `uploader` directory contains the uploader that sends the parsed data to the Nebula API MongoDB database. This is the final stage of the data pipeline. - The uploader(s) are concerned solely with pushing parsed data to the database. Data, at this point, is assumed to be valid and ready for use. +#### - The `static-data/grades` directory contains .csv files of UTD grade data. 
+ - Files are named by year and semester, with a suffix of `S`, `U`, or `F` denoting Spring, Summer, and Fall semesters, respectively. + - This means that, for example, `22F.csv` corresponds to the 2022 Fall semester, whereas `18U.csv` corresponds with the 2018 Summer semester. + - This grade data is collected independently from the scrapers, and is used during the parsing process. +#### - The `static-data/budgets` directory contains .pdf files of UTD budget data. + - Files are named by fiscal year. + - This budget data is used as a backup of scraped data as some years have been removed from the website. ### Contributing @@ -79,7 +82,7 @@ Run the tool by changing directory using `cd` to the `api-tools` directory and r | `./api-tools -parse -academicCalendars` | Parses academic calendar PDFs. | | `./api-tools -parse -astra` | Parses Astra data. | | `./api-tools -parse -cometCalendar` | Parses Comet Calendar data. | -| `./api-tools -parse -csv [directory]` | Outputs grade data CSVs (default: `./grade-data`). | +| `./api-tools -parse -gradesDir [directory]` | Outputs grade data CSVs (default: `./static-data/grades`). | | `./api-tools -parse -discounts` | Parses discount programs HTML. | | `./api-tools -parse -degrees` | Parses degrees from HTML. | | `./api-tools -parse -map` | Parses UTD Map data. 
| diff --git a/go.mod b/go.mod index c0c1a6c..fe3a6ea 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.26 require ( github.com/PuerkitoBio/goquery v1.12.0 - github.com/UTDNebula/nebula-api/api v0.0.0-20260327185527-807066607244 + github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe github.com/chromedp/cdproto v0.0.0-20260321001828-e3e3800016bc github.com/chromedp/chromedp v0.15.1 github.com/dongri/phonenumber v0.1.12 diff --git a/go.sum b/go.sum index 57a9fbc..f9d36bf 100644 --- a/go.sum +++ b/go.sum @@ -62,6 +62,8 @@ github.com/UTDNebula/nebula-api/api v0.0.0-20260226225356-d9b9e35d3052 h1:bN/JW1 github.com/UTDNebula/nebula-api/api v0.0.0-20260226225356-d9b9e35d3052/go.mod h1:vWwnuoXFE/Lo9yW6Z6DJguCtAHu0xMym+6r2IEru1v0= github.com/UTDNebula/nebula-api/api v0.0.0-20260327185527-807066607244 h1:vp2hsJiJwxpgYCTGd3hxWPQay7g7MvtYbLINDmN1+p4= github.com/UTDNebula/nebula-api/api v0.0.0-20260327185527-807066607244/go.mod h1:lp0oZHhVmqAqm0gf6Ald2jZXepZ0xFheTsW76T9wC7I= +github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe h1:/y+M3Up3U7PKvWV7yyZ7ouvNd8081Zwmd4p5NFD3kk4= +github.com/UTDNebula/nebula-api/api v0.0.0-20260501050907-0dea4acc1dfe/go.mod h1:i+PQZZ3qPtE4UxXkp3tQ46NWpzB8Of2/VNl0iZ/uv9I= github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= diff --git a/main.go b/main.go index 29f6603..a5288f5 100644 --- a/main.go +++ b/main.go @@ -50,10 +50,14 @@ func main() { academicCalendars := flag.Bool("academicCalendars", false, "Alongside -scrape, -parse, or -upload, signifies that the academic calendars should be scraped/parsed/uploaded.") // Flag for degree scraping and parsing degrees := flag.Bool("degrees", false, "Alongside -scrape, -parse, or -upload. 
Signifies that the degrees should be scraped/parsed/uploaded.") + // Flag for budget scraping + budgets := flag.Bool("budgets", false, "Alongside -scrape, -parse, or -upload, signifies that the budgets should be scraped/parsed/uploaded.") // Flags for parsing parse := flag.Bool("parse", false, "Puts the tool into parsing mode.") - csvDir := flag.String("csv", "./grade-data", "Alongside -parse, specifies the path to the directory of CSV files containing grade data.") + gradesDir := flag.String("gradesDir", "./static-data/grades", "Alongside -parse, specifies the path to the directory of CSV files containing grade data.") + useBackupBudgets := flag.Bool("useBackupBudgets", false, "Alongside -parse, specifies that backup budget data should also be parsed.") + budgetsDir := flag.String("budgetsDir", "./static-data/budgets", "Alongside -parse, specifies the path to the directory of PDF files containing budget data.") skipValidation := flag.Bool("skipv", false, "Alongside -parse, signifies that the post-parsing validation should be skipped. 
Be careful with this!") // Flags for uploading data @@ -122,6 +126,8 @@ func main() { scrapers.ScrapeAcademicCalendars(*outDir) case *degrees: scrapers.ScrapeDegrees(*outDir) + case *budgets: + scrapers.ScrapeBudgets(*outDir) default: log.Panic("You must specify which type of scraping you would like to perform with one of the scraping flags!") } @@ -141,8 +147,10 @@ func main() { parser.ParseDiscounts(*inDir, *outDir) case *degrees: parser.ParseDegrees(*inDir, *outDir) + case *budgets: + parser.ParseBudgets(*inDir, *outDir, *budgetsDir, *useBackupBudgets) default: - parser.Parse(*inDir, *outDir, *csvDir, *skipValidation) + parser.Parse(*inDir, *outDir, *gradesDir, *skipValidation) } case *upload: switch { @@ -156,6 +164,8 @@ func main() { uploader.UploadDiscounts(*inDir) case *degrees: uploader.UploadDegrees(*inDir) + case *budgets: + uploader.UploadBudgets(*inDir) default: uploader.Upload(*inDir, *replace, *staticOnly) } diff --git a/parser/academicCalendars.go b/parser/academicCalendarsParser.go similarity index 54% rename from parser/academicCalendars.go rename to parser/academicCalendarsParser.go index b2e4d3c..2ffb03e 100644 --- a/parser/academicCalendars.go +++ b/parser/academicCalendarsParser.go @@ -8,19 +8,15 @@ complicated installation process, or errored on one of the PDFs. package parser import ( - "bytes" "context" "crypto/sha256" "encoding/hex" "encoding/json" "fmt" - "io" "io/fs" "log" - "net/http" - "os" - "os/exec" "path/filepath" + "reflect" "strings" "sync" "time" @@ -30,12 +26,8 @@ import ( "google.golang.org/genai" ) -// Store client to only create once -var once sync.Once -var geminiClient *genai.Client - // What gets sent to Gemini, with the PDF content added -var prompt = `Parse this PDF content and generate the following JSON schema. +var academicCalendarPrompt = `Parse this PDF content and generate the following JSON schema. 
{ _id: %s, @@ -99,14 +91,14 @@ func ParseAcademicCalendars(inDir string, outDir string) { for path := range jobs { log.Printf("Parsing %s...", filepath.Base(path)) - academicCalendar, err := parsePdf(path) + academicCalendar, err := parseAcademicCalendarPdf(path) if err != nil { if strings.Contains(err.Error(), "429") { // Exponential-ish backoff up to 60s for 429 rate limiting backoffs := []time.Duration{20 * time.Second, 40 * time.Second, 60 * time.Second} for _, delay := range backoffs { time.Sleep(delay) - academicCalendar, err = parsePdf(path) + academicCalendar, err = parseAcademicCalendarPdf(path) if err == nil || !strings.Contains(err.Error(), "429") { break } @@ -149,7 +141,12 @@ func ParseAcademicCalendars(inDir string, outDir string) { } // Read a PDF, build a prompt for Gemini to parse it, check if it has already been asked in the cache, and ask Gemini if not -func parsePdf(path string) (schema.AcademicCalendar, error) { +func parseAcademicCalendarPdf(path string) (schema.AcademicCalendar, error) { + apiBucket, err := getAcademicCalendarBucket() + if err != nil { + return schema.AcademicCalendar{}, err + } + // "Fall 2025" to "25F" filename := filepath.Base(path) filename = filename[0 : len(filename)-4] @@ -165,18 +162,18 @@ func parsePdf(path string) (schema.AcademicCalendar, error) { timeline := filenameParts[0] // Read PDF - content, err := readPdf(path) + content, err := utils.ReadPdf(path, 1) if err != nil { return schema.AcademicCalendar{}, err } // Build prompt - promptFilled := fmt.Sprintf(prompt, name, timeline, content) + promptFilled := fmt.Sprintf(academicCalendarPrompt, name, timeline, content) // Check cache hashByte := sha256.Sum256([]byte(promptFilled)) hash := hex.EncodeToString(hashByte[:]) + ".json" - result, err := checkCache(hash) + result, err := utils.CheckCache(hash, apiBucket) if err != nil { return schema.AcademicCalendar{}, err } @@ -189,23 +186,30 @@ func parsePdf(path string) (schema.AcademicCalendar, error) { log.Printf("No 
cache for %s, asking Gemini.", filename) // AI - geminiClient := getGeminiClient() + geminiClient := utils.GetGeminiClient() + + // Response schema + calendarSchema := utils.StructToSchema(reflect.TypeOf(schema.AcademicCalendar{})) // Send request with default config response, err := geminiClient.Models.GenerateContent(context.Background(), "gemini-2.5-pro", genai.Text(promptFilled), - &genai.GenerateContentConfig{}, + // Enforce response schema + &genai.GenerateContentConfig{ + ResponseMIMEType: "application/json", + ResponseSchema: calendarSchema, + }, ) if err != nil { return schema.AcademicCalendar{}, err } - // Get response, remove backtick formatting if present - result = strings.ReplaceAll(strings.ReplaceAll(response.Candidates[0].Content.Parts[0].Text, "```json", ""), "```", "") + // Get response + result = response.Candidates[0].Content.Parts[0].Text // Set cache for next time - err = setCache(hash, result) + err = utils.SetCache(hash, result, apiBucket) if err != nil { return schema.AcademicCalendar{}, err } @@ -221,154 +225,11 @@ func parsePdf(path string) (schema.AcademicCalendar, error) { return academicCalendar, nil } -// Read the text from the first page of a PDF -// Using external program pdftotext -func readPdf(path string) (string, error) { - cmd := exec.Command("pdftotext", "-l", "1", "-raw", path, "-") - - var out bytes.Buffer - var stderr bytes.Buffer - cmd.Stdout = &out - cmd.Stderr = &stderr - - if err := cmd.Run(); err != nil { - return "", fmt.Errorf("failed to run pdftotext: %v (%s)", err, stderr.String()) - } - - return out.String(), nil -} - -// Check cache for a response to the same prompt -func checkCache(hash string) (string, error) { - apiUrl, apiBucket, apiKey, apiStorageKey, err := getNebulaKeys() - if err != nil { - return "", err - } - - client := &http.Client{} - - // Make request - req, err := http.NewRequest("GET", apiUrl+"storage/"+apiBucket+"/"+hash, nil) - if err != nil { - return "", err - } - req.Header.Add("x-api-key", 
apiKey) - req.Header.Add("x-storage-key", apiStorageKey) - resp, err := client.Do(req) - if err != nil { - return "", err - } - defer resp.Body.Close() - - // Read the response body - body, err := io.ReadAll(resp.Body) - if err != nil { - return "", err - } - var parsedBody schema.APIResponse[schema.ObjectInfo] - err = json.Unmarshal([]byte(body), &parsedBody) - if err != nil { - // If this errors, return ("", nil) to indicate not found - return "", nil - } - - // Fetch object - req, err = http.NewRequest("GET", parsedBody.Data.MediaLink, nil) - if err != nil { - return "", err - } - resp, err = client.Do(req) - if err != nil { - return "", err - } - defer resp.Body.Close() - - // Read the response body - body, err = io.ReadAll(resp.Body) - if err != nil { - return "", err - } - - return string(body), nil -} - -// Upload AI response to cache -func setCache(hash string, result string) error { - apiUrl, apiBucket, apiKey, apiStorageKey, err := getNebulaKeys() - if err != nil { - return err - } - - // Make request - jsonStr := []byte(result) - bodyReader := bytes.NewBuffer(jsonStr) - req, err := http.NewRequest("POST", apiUrl+"storage/"+apiBucket+"/"+hash, bodyReader) - if err != nil { - return err - } - req.Header.Set("Content-Type", "application/json") - req.Header.Add("x-api-key", apiKey) - req.Header.Add("x-storage-key", apiStorageKey) - client := &http.Client{} - resp, err := client.Do(req) - if err != nil { - return err - } - defer resp.Body.Close() - - return nil -} - -// Get all the keys to access the Nebula API storage routes -func getNebulaKeys() (string, string, string, string, error) { - apiUrl, err := utils.GetEnv("NEBULA_API_URL") - if err != nil { - return "", "", "", "", err - } +// Get the storage bucket for the academic calendar cache +func getAcademicCalendarBucket() (string, error) { apiBucket, err := utils.GetEnv("NEBULA_API_STORAGE_BUCKET") if err != nil { - return "", "", "", "", err - } - apiKey, err := utils.GetEnv("NEBULA_API_KEY") - if err 
!= nil { - return "", "", "", "", err - } - apiStorageKey, err := utils.GetEnv("NEBULA_API_STORAGE_KEY") - if err != nil { - return "", "", "", "", err + return "", err } - - return apiUrl, apiBucket, apiKey, apiStorageKey, nil -} - -// Create client only once -// Auth is from GOOGLE_GENAI_USE_VERTEXAI, GOOGLE_CLOUD_PROJECT and GOOGLE_APPLICATION_CREDENTIALS environment variables and service account JSON which is created from GEMINI_SERVICE_ACCOUNT -func getGeminiClient() *genai.Client { - once.Do(func() { - // Create JSON file - serviceAccount, err := utils.GetEnv("GEMINI_SERVICE_ACCOUNT") - if err != nil { - panic(err) - } - jsonFile, err := utils.GetEnv("GOOGLE_APPLICATION_CREDENTIALS") - if err != nil { - panic(err) - } - err = os.WriteFile(jsonFile, []byte(serviceAccount), 0644) - if err != nil { - panic(err) - } - - // Create client - geminiClient, err = genai.NewClient(context.Background(), - &genai.ClientConfig{ - Project: "api-tools-451421", - Location: "us-central1", - Backend: genai.BackendVertexAI, - }) - if err != nil { - panic(err) - } - }) - return geminiClient + return apiBucket, nil } diff --git a/parser/budgetsParser.go b/parser/budgetsParser.go new file mode 100644 index 0000000..708e20e --- /dev/null +++ b/parser/budgetsParser.go @@ -0,0 +1,476 @@ +/* +Code requires having pdftotext installed: https://www.xpdfreader.com/pdftotext-man.html +apt-get install -y poppler-utils +I found all the Go programs for PDF text extraction were all either paid, had a +complicated installation process, or errored on one of the PDFs. 
+*/ + +package parser + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "io/fs" + "log" + "path/filepath" + "reflect" + "strings" + "sync" + "time" + + "github.com/UTDNebula/api-tools/utils" + "github.com/UTDNebula/nebula-api/api/schema" + "google.golang.org/genai" +) + +// What gets sent to Gemini, with the PDF content added +var budgetPrompt = `Parse the content of these PDFs and generate the following JSON schema. + +{ + _id: %s, + operating_budget: { // Data only from the Operating Budget file + operating_revenues: { + name: "Operating Revenues", // From the Operating Budget - Expenses by Functional Classification table + rows: [ + { + label: string, // Tuition and Fees (Gross), Less Discounts and Allowances, Federal Sponsored Programs, ... + value: number // Use the total from the latest FY (last column) + } + ], + total: number + }, + operating_expenses: { + name: "Operating Expenses", // Right under, from the same Operating Budget - Expenses by Functional Classification table + rows: [ + { + label: string, // Instruction, Academic Support, Research, ... + value: number // Use the total from the latest FY (last column) + } + ], + total: number + }, + budgeted_nonoperating_revenues: { + name: "Budgeted Nonoperating Revenues (Expenses)", // Right under, from the same Operating Budget - Expenses by Functional Classification table + rows: [ + { + label: string, // State Appropriations, Federal Sponsored Programs - Nonoperating, State/Local Sponsored Programs - Nonoperating, ... + value: number // Use the total from the latest FY (last column) + } + ], + total: number + }, + salaries_doe_and_instructional_admin: { + name: "Summary of Faculty Salaries, Departmental Operating Expenses, and Instructional Administration", + rows: [ + { + label: string, // Provost and V.P. 
Academic Affairs, each school, Other Instructional Support + value: { // Use the values from the latest FY (last 4 columns) + total: number, + faculty_salaries: number, + departmental_operating_expenses: number, + instructional_administration: number + } + } + ], + total: { + total: number, + faculty_salaries: number, + departmental_operating_expenses: number, + instructional_administration: number + } + }, + service_departments_funds: { + name: "Service Departments and Revolving Funds", // In the Service Department Funds section + rows: [ + { + name: string, // Sub tables by school and other categories + rows: [ + { + label: string, + value: { + estimated_income: number, + budgeted_expenses: number + } + } + ], + total: { + estimated_income: number, + budgeted_expenses: number + } + } + ], + total: { + estimated_income: number, + budgeted_expenses: number + } + }, + designated_funds: { + name: "Designated Funds", // In the Designated Funds section + rows: [ + { + name: string, // Sub tables by school and other categories + rows: [ + { + label: string, + value: { + estimated_income: number, + budgeted_expenses: number + } + } + ], + total: { + estimated_income: number, + budgeted_expenses: number + } + } + ], + total: { + estimated_income: number, + budgeted_expenses: number + } + }, + budgeted_tuition_and_student_fees: { + name: "Budgeted Tuition and Student Fees", // In the Designated Funds section + rows: [ + { + name: string, // Tuition, Laboratory & Supplemental Fees, Mandatory Student Fees, Program, Course Related & Other Incidental Fees + rows: [ + { + label: string, // Tuition, Tuition Differential Exemption; Laboratory Fees Excessive Hours, Three Peat Fee; Advising Fee, Athletic Program Fee, Information Technology Fee; Application Fee; Bursar Fees, Late Fees; Chec Collin County + value: number // Use the total from the latest FY (last column) + } + ], + total: number + } + ], + total: number + }, + auxiliary_expenses: { + name: "Auxiliary Expenses", // In 
the Auxiliary Enterprises Funds section + rows: [ + { + name: string, // Sub tables by school and other categories including Facilities and Economic Dev, Student Affairs, ... + rows: [ + { + label: string, + value: { + estimated_income: number, + budgeted_expenses: number, + debt_service: number, + other: number + } + } + ], + total: { + estimated_income: number, + budgeted_expenses: number, + debt_service: number, + other: number + } + } + ], + total: { + estimated_income: number, + budgeted_expenses: number, + debt_service: number, + other: number + } + }, + restricted_funds: { + name: "Restricted Funds", // In the Restricted Gift Funds section, Endowments table + rows: [ + { + name: string, // Sub tables by school and other categories + rows: [ + { + label: string, + value: { + estimated_income: number, + budgeted_expenses: number + } + } + ], + total: { + estimated_income: number, + budgeted_expenses: number + } + } + ], + total: { + estimated_income: number, + budgeted_expenses: number + } + } + }, + annual_financial_report: { // Data only from the Annual Financial Report file + // All from the Exhibit B Statement of Revenues, Expenses, and Changes in Net Position table + operating_revenues: { + name: "Operating Revenues", + rows: [ + { + label: string, // Student Tuition and Fees, Discounts and Allowances, Federal Sponsored Programs, ... + value: number // Use the total from the latest FY (first column) + } + ], + total: number + }, + operating_expenses: { + name: "Operating Expenses", + rows: [ + { + label: string, // Instruction, Research, Public Service, ... + value: number // Use the total from the latest FY (first column) + } + ], + total: number + }, + nonoperating_revenues: { + name: "Nonoperating Revenues (Expenses)", + rows: [ + { + label: string, // State Appropriations, Federal Nonexchange Sponsored Programs, Federal Nonexchange Pass-Through, ... 
+ value: number // Use the total from the latest FY (first column) + } + ], + total: number + }, + beginning_net_position: number, + ending_net_position: number + } +} + +- Use the full UTD school names in this title text: School of Arts, Humanities, and Technology; School of Behavioral and Brain Sciences; School of Economic, Political and Policy Sciences; School of Engineering and Computer Science; School of Interdisciplinary Studies; School of Management; School of Natural Sciences and Mathematics. + - In older years: School of Arts, Technology, and Emerging Communication; School of Arts & Humanities. + - Replace Brian with Brain in the School of Behavioral and Brain Sciences name if it is misspelled in the PDF. +- Always use the data listed for %s, not any previous years. +- Do not infer, estimate, or guess any values. +- If a value is missing or unclear, return null for that field. +- Only values surrounded by parentheses in the tables should be considered negative. + +Content of PDFs: + +%s` + +func ParseBudgets(inDir string, outDir string, budgetsDir string, useBackupBudgets bool) { + // Get sub folder from output folder + inSubDir := filepath.Join(inDir, "budgets") + + result := []schema.Budget{} + + // Parallel requests + numWorkers := 10 + jobs := make(chan []string) + var wg sync.WaitGroup + var mu sync.Mutex + + // Start worker goroutines + for range numWorkers { + wg.Add(1) + go func() { + defer wg.Done() + for paths := range jobs { + year := filepath.Base(filepath.Dir(paths[0])) + log.Printf("Parsing %s...", year) + + budget, err := parseBudgetPdfs(paths) + if err != nil { + if strings.Contains(err.Error(), "429") { + // Exponential-ish backoff up to 60s for 429 rate limiting + backoffs := []time.Duration{20 * time.Second, 40 * time.Second, 60 * time.Second} + for _, delay := range backoffs { + time.Sleep(delay) + budget, err = parseBudgetPdfs(paths) + if err == nil || !strings.Contains(err.Error(), "429") { + break + } + } + } + + if err != nil { + 
panic(err) + } + } + + mu.Lock() + result = append(result, budget) + mu.Unlock() + + log.Printf("Parsed %s!", year) + } + }() + } + + yearsScraped := make(map[string]bool) + + // Traverse scraped directory, storing years found + err := filepath.WalkDir(inSubDir, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + // Skip the root "budgets" directory itself + if path == inSubDir { + return nil + } + if d.IsDir() { // Is a folder + year := filepath.Base(path) + yearsScraped[year] = true + var files []string + err := filepath.WalkDir(path, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + if !d.IsDir() { // Is a file + files = append(files, path) + } + return nil + }) + if err != nil { + return err + } + jobs <- files + } + return nil + }) + if err != nil { + panic(err) + } + + // If we're including the backup budgets, since UTD has removed some older PDFs from the website + if useBackupBudgets { + // Traverse backup directory, only adding jobs for years not found in scraped directory + err = filepath.WalkDir(budgetsDir, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + // Skip the root "budgets" directory itself + if path == budgetsDir { + return nil + } + if d.IsDir() { // Is a folder + year := filepath.Base(path) + if !yearsScraped[year] { + var files []string + err := filepath.WalkDir(path, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + if !d.IsDir() { // Is a file + files = append(files, path) + } + return nil + }) + if err != nil { + return err + } + jobs <- files + } + } + return nil + }) + if err != nil { + panic(err) + } + } + + close(jobs) + + // Wait for workers to finish + wg.Wait() + + utils.WriteJSON(fmt.Sprintf("%s/budgets.json", outDir), result) +} + +// Read a PDF, build a prompt for Gemini to parse it, check if it has already been asked in the cache, and ask Gemini if not +func 
parseBudgetPdfs(paths []string) (schema.Budget, error) { + apiBucket, err := getBudgetBucket() + if err != nil { + return schema.Budget{}, err + } + + year := filepath.Base(filepath.Dir(paths[0])) + + // Read PDFs + var contentBuilder strings.Builder + for _, path := range paths { + content, err := utils.ReadPdf(path, -1) + if err != nil { + return schema.Budget{}, err + } + contentBuilder.WriteString("# " + filepath.Base(path) + "\n\n") + contentBuilder.WriteString(content + "\n\n\n") + } + content := contentBuilder.String() + + // Build prompt + promptFilled := fmt.Sprintf(budgetPrompt, year, year, content) + + // Check cache + hashByte := sha256.Sum256([]byte(promptFilled)) + hash := hex.EncodeToString(hashByte[:]) + ".json" + result, err := utils.CheckCache(hash, apiBucket) + if err != nil { + return schema.Budget{}, err + } + + // Skip AI if cache found + if result != "" { + log.Printf("Cache found for %s!", year) + } else { + // Cache not found + log.Printf("No cache for %s, asking Gemini.", year) + + // AI + geminiClient := utils.GetGeminiClient() + + // Response schema + budgetSchema := utils.StructToSchema(reflect.TypeOf(schema.Budget{})) + + // Send request with default config + response, err := geminiClient.Models.GenerateContent(context.Background(), + "gemini-2.5-pro", + genai.Text(promptFilled), + // Enforce response schema + &genai.GenerateContentConfig{ + ResponseMIMEType: "application/json", + ResponseSchema: budgetSchema, + }, + ) + if err != nil { + return schema.Budget{}, err + } + + // Get response + result = response.Candidates[0].Content.Parts[0].Text + log.Print("Token counts:") + log.Printf("Prompt: %d", response.UsageMetadata.PromptTokenCount) + log.Printf("Thoughts: %d", response.UsageMetadata.ThoughtsTokenCount) + log.Printf("Total: %d", response.UsageMetadata.TotalTokenCount) + + // Set cache for next time + err = utils.SetCache(hash, result, apiBucket) + if err != nil { + return schema.Budget{}, err + } + } + + // Build struct + var 
budget schema.Budget + err = json.Unmarshal([]byte(result), &budget) + if err != nil { + return schema.Budget{}, err + } + + return budget, nil +} + +// Get the storage bucket for the budget cache +func getBudgetBucket() (string, error) { + apiBucket, err := utils.GetEnv("NEBULA_API_BUDGET_STORAGE_BUCKET") + if err != nil { + return "", err + } + return apiBucket, nil +} diff --git a/parser/gradeLoader_test.go b/parser/gradeLoader_test.go index efb3e1c..fbc8156 100644 --- a/parser/gradeLoader_test.go +++ b/parser/gradeLoader_test.go @@ -91,7 +91,7 @@ func TestLoadGrades(t *testing.T) { } t.Run("Real_Data", func(t *testing.T) { - _, err := loadGrades("../grade-data/") + _, err := loadGrades("../static-data/grades/") if err != nil { t.Errorf("failed to load grades: %v", err) } diff --git a/parser/parser_test.go b/parser/parser_test.go index 41e7af4..c9e3da2 100644 --- a/parser/parser_test.go +++ b/parser/parser_test.go @@ -132,7 +132,7 @@ func updateTestData() error { } defer os.RemoveAll(tempDir) - GradeMap, err = loadGrades("../grade-data") + GradeMap, err = loadGrades("../static-data/grades") if err != nil { return err } @@ -217,7 +217,7 @@ func updateTestData() error { //rerun parser to get Courses.json, Sections.json, Professors.json - Parse(tempDir, tempDir, "../grade-data", false) + Parse(tempDir, tempDir, "../static-data/grades", false) targetDir := "testdata" @@ -294,7 +294,7 @@ func clearGlobals() { func TestParse(t *testing.T) { tempDir := t.TempDir() - Parse("testdata", tempDir, "../grade-data", false) + Parse("testdata", tempDir, "../static-data/grades", false) OutputCourses, err := unmarshallFile[[]schema.Course](filepath.Join(tempDir, "courses.json")) if err != nil { diff --git a/runners/monthly.sh b/runners/monthly.sh index 7995e13..8c2db2d 100644 --- a/runners/monthly.sh +++ b/runners/monthly.sh @@ -6,3 +6,8 @@ ./api-tools -headless -verbose -scrape -map ./api-tools -headless -verbose -parse -map ./api-tools -headless -verbose -upload -map + +# 
scrape, parse, and upload budgets +./api-tools -headless -verbose -scrape -budgets -useBackupBudgets +./api-tools -headless -verbose -parse -budgets -useBackupBudgets +./api-tools -headless -verbose -upload -budgets -useBackupBudgets diff --git a/runners/weekly.sh b/runners/weekly.sh index 401e688..6eccb74 100644 --- a/runners/weekly.sh +++ b/runners/weekly.sh @@ -15,4 +15,4 @@ # scrape, parse, and upload degrees ./api-tools -headless -verbose -scrape -degrees ./api-tools -headless -verbose -parse -degrees -./api-tools -headless -verbose -upload -degrees \ No newline at end of file +./api-tools -headless -verbose -upload -degrees diff --git a/scrapers/adacemicCalendars.go b/scrapers/academicCalendars.go similarity index 76% rename from scrapers/adacemicCalendars.go rename to scrapers/academicCalendars.go index a26c4c1..df51b6d 100644 --- a/scrapers/adacemicCalendars.go +++ b/scrapers/academicCalendars.go @@ -5,7 +5,6 @@ package scrapers import ( - "context" "fmt" "io" "log" @@ -83,8 +82,14 @@ func ScrapeAcademicCalendars(outDir string) { if err != nil { panic(err) } - newCalendars := extractTextAndHref(futureNodes, "future", chromedpCtx) - academicCalendars = append(academicCalendars, newCalendars...) + links := utils.ExtractTextAndHref(futureNodes, chromedpCtx) + for _, link := range links { + academicCalendars = append(academicCalendars, AcademicCalendar{ + Title: link.Text, + Href: link.Href, + Time: "future", + }) + } // Past list var pastNodes []*cdp.Node @@ -94,8 +99,14 @@ func ScrapeAcademicCalendars(outDir string) { if err != nil { panic(err) } - newCalendars = extractTextAndHref(pastNodes, "past", chromedpCtx) - academicCalendars = append(academicCalendars, newCalendars...) 
+ links = utils.ExtractTextAndHref(pastNodes, chromedpCtx) + for _, link := range links { + academicCalendars = append(academicCalendars, AcademicCalendar{ + Title: link.Text, + Href: link.Href, + Time: "past", + }) + } // Don't need ChromeDP anymore cancel() @@ -110,32 +121,6 @@ func ScrapeAcademicCalendars(outDir string) { } } -func extractTextAndHref(nodes []*cdp.Node, time string, chromedpCtx context.Context) []AcademicCalendar { - output := []AcademicCalendar{} - var err error - - // Extract href and text - for _, n := range nodes { - var href, text string - // Get href attribute - for i := 0; i < len(n.Attributes); i += 2 { - if n.Attributes[i] == "href" { - href = n.Attributes[i+1] - } - } - // Get inner text - err = chromedp.Run(chromedpCtx, - chromedp.TextContent(fmt.Sprintf(`a[href="%s"]`, href), &text, chromedp.ByQuery), - ) - if err != nil { - panic(err) - } - output = append(output, AcademicCalendar{text, href, time}) - } - - return output -} - func downloadPdfFromBox(href string, filename string, outDir string) { // Create blank file out, err := os.Create(filepath.Join(outDir, fmt.Sprintf("%s.pdf", filename))) diff --git a/scrapers/budgets.go b/scrapers/budgets.go new file mode 100644 index 0000000..6fde3f9 --- /dev/null +++ b/scrapers/budgets.go @@ -0,0 +1,154 @@ +/* + This file contains the code for the budgets scraper. 
+*/ + +package scrapers + +import ( + "fmt" + "io" + "log" + "net/http" + "os" + "path/filepath" + "regexp" + "time" + + "github.com/UTDNebula/api-tools/utils" + "github.com/chromedp/cdproto/cdp" + "github.com/chromedp/chromedp" +) + +type Budget struct { + Title string + Href string +} + +func ScrapeBudgets(outDir string) { + // Start chromedp + chromedpCtx, cancel := utils.InitChromeDp() + + // Get sub folder from output folder + outSubDir := filepath.Join(outDir, "budgets") + + // Make output folder + os.RemoveAll(outSubDir) + err := os.MkdirAll(outSubDir, 0777) + if err != nil { + panic(err) + } + + // Go to listings page + _, err = chromedp.RunResponse(chromedpCtx, + chromedp.Navigate(`https://finance.utdallas.edu/for-others/public-reports/`), + ) + if err != nil { + panic(err) + } + + // Selector for the scraping the budget nodes + financialReportSel := `//h2[normalize-space(text())="Annual Financial Statements"]/following-sibling::ul[1]//a` + budgetReportSel := `//h2[normalize-space(text())="Annual Budget Reports"]/following-sibling::div[1]//details//ul/li[1]/a` + + budgets := []Budget{} + + // Extract data from links + // Annual Financial Statements + var financialReportNodes []*cdp.Node + err = chromedp.Run(chromedpCtx, + chromedp.Nodes(financialReportSel, &financialReportNodes, chromedp.BySearch), + ) + if err != nil { + panic(err) + } + links := utils.ExtractTextAndHref(financialReportNodes, chromedpCtx) + for _, link := range links { + budgets = append(budgets, Budget{ + Title: link.Text, + Href: link.Href, + }) + } + + // Annual Financial Statements + var budgetReportNodes []*cdp.Node + err = chromedp.Run(chromedpCtx, + chromedp.Nodes(budgetReportSel, &budgetReportNodes, chromedp.BySearch), + ) + if err != nil { + panic(err) + } + links = utils.ExtractTextAndHref(budgetReportNodes, chromedpCtx) + for _, link := range links { + budgets = append(budgets, Budget{ + Title: link.Text, + Href: link.Href, + }) + } + + // Don't need ChromeDP anymore + cancel() 
+ + // Download all PDFs + for _, budget := range budgets { + downloadPdf( + budget.Href, + budget.Title, + outSubDir, + ) + } +} + +func downloadPdf(href string, filename string, outDir string) { + client := &http.Client{ + Timeout: 30 * time.Second, + } + + req, err := http.NewRequest("GET", href, nil) + if err != nil { + panic(err) + } + + // Use a user agent and referer to avoid 599 errors + req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0") + req.Header.Set("Referer", "https://finance.utdallas.edu/for-others/public-reports/") + + resp, err := client.Do(req) + if err != nil { + panic(err) + } + defer resp.Body.Close() + + // Check response + if resp.StatusCode != http.StatusOK { + panic(fmt.Errorf("failed to download \"%s\": status code %d", filename, resp.StatusCode)) + } + + // Get sub folder from output folder + re := regexp.MustCompile(`FY\d{2}`) + match := re.FindString(filename) + outYearDir := filepath.Join(outDir, "20"+match[len(match)-2:]) + + // Make output folder + err = os.MkdirAll(outYearDir, 0777) + if err != nil { + panic(err) + } + + // Create blank file + out, err := os.Create(filepath.Join(outYearDir, fmt.Sprintf("%s.pdf", filename))) + if err != nil { + panic(err) + } + defer out.Close() + + // Output response to blank file + _, err = io.Copy(out, resp.Body) + if err != nil { + log.Printf("Error saving %s: %v", filename, err) + return + } + + log.Printf("Scraped budget %s!", filename) + + time.Sleep(1 * time.Second) +} diff --git a/static-data/budgets/2016/AFR FY16 Exhibits A-C (pdf).pdf b/static-data/budgets/2016/AFR FY16 Exhibits A-C (pdf).pdf new file mode 100644 index 0000000..72a3f96 Binary files /dev/null and b/static-data/budgets/2016/AFR FY16 Exhibits A-C (pdf).pdf differ diff --git a/static-data/budgets/2016/FY16 UT Dallas Operating Budget (pdf).pdf b/static-data/budgets/2016/FY16 UT Dallas Operating Budget (pdf).pdf new file mode 100644 index 0000000..55ddd9f Binary files 
/dev/null and b/static-data/budgets/2016/FY16 UT Dallas Operating Budget (pdf).pdf differ diff --git a/static-data/budgets/2017/AFR FY17 Exhibits A-C (pdf).pdf b/static-data/budgets/2017/AFR FY17 Exhibits A-C (pdf).pdf new file mode 100644 index 0000000..70d8113 Binary files /dev/null and b/static-data/budgets/2017/AFR FY17 Exhibits A-C (pdf).pdf differ diff --git a/static-data/budgets/2018/AFR FY18 Exhibits A-C (pdf).pdf b/static-data/budgets/2018/AFR FY18 Exhibits A-C (pdf).pdf new file mode 100644 index 0000000..5edc3a7 Binary files /dev/null and b/static-data/budgets/2018/AFR FY18 Exhibits A-C (pdf).pdf differ diff --git "a/static-data/budgets/2018/FY18 UT Dallas Operating Budget\302\240(pdf).pdf" "b/static-data/budgets/2018/FY18 UT Dallas Operating Budget\302\240(pdf).pdf" new file mode 100644 index 0000000..388c307 Binary files /dev/null and "b/static-data/budgets/2018/FY18 UT Dallas Operating Budget\302\240(pdf).pdf" differ diff --git a/static-data/budgets/2019/AFR FY19 Exhibits A-C (pdf).pdf b/static-data/budgets/2019/AFR FY19 Exhibits A-C (pdf).pdf new file mode 100644 index 0000000..3df9e33 Binary files /dev/null and b/static-data/budgets/2019/AFR FY19 Exhibits A-C (pdf).pdf differ diff --git "a/static-data/budgets/2019/FY19 UT Dallas Operating Budget\302\240(pdf).pdf" "b/static-data/budgets/2019/FY19 UT Dallas Operating Budget\302\240(pdf).pdf" new file mode 100644 index 0000000..0e7ac08 Binary files /dev/null and "b/static-data/budgets/2019/FY19 UT Dallas Operating Budget\302\240(pdf).pdf" differ diff --git a/static-data/budgets/2020/AFR FY20 Exhibits A-C (pdf).pdf b/static-data/budgets/2020/AFR FY20 Exhibits A-C (pdf).pdf new file mode 100644 index 0000000..a147586 Binary files /dev/null and b/static-data/budgets/2020/AFR FY20 Exhibits A-C (pdf).pdf differ diff --git a/static-data/budgets/2020/FY20 UT Dallas Operating Budget (pdf).pdf b/static-data/budgets/2020/FY20 UT Dallas Operating Budget (pdf).pdf new file mode 100644 index 0000000..cb402e5 Binary 
files /dev/null and b/static-data/budgets/2020/FY20 UT Dallas Operating Budget (pdf).pdf differ diff --git a/static-data/budgets/2021/AFR FY21 Exhibits A-C (pdf).pdf b/static-data/budgets/2021/AFR FY21 Exhibits A-C (pdf).pdf new file mode 100644 index 0000000..6fc868a Binary files /dev/null and b/static-data/budgets/2021/AFR FY21 Exhibits A-C (pdf).pdf differ diff --git "a/static-data/budgets/2021/FY21 UT Dallas Operating Budget (pdf)\302\240.pdf" "b/static-data/budgets/2021/FY21 UT Dallas Operating Budget (pdf)\302\240.pdf" new file mode 100644 index 0000000..4c8fc98 Binary files /dev/null and "b/static-data/budgets/2021/FY21 UT Dallas Operating Budget (pdf)\302\240.pdf" differ diff --git "a/static-data/budgets/2022/AFR FY22 Exhibits A-C\302\240(pdf).pdf" "b/static-data/budgets/2022/AFR FY22 Exhibits A-C\302\240(pdf).pdf" new file mode 100644 index 0000000..bcc2351 Binary files /dev/null and "b/static-data/budgets/2022/AFR FY22 Exhibits A-C\302\240(pdf).pdf" differ diff --git a/static-data/budgets/2022/FY22 UT Dallas Operating Budget (pdf).pdf b/static-data/budgets/2022/FY22 UT Dallas Operating Budget (pdf).pdf new file mode 100644 index 0000000..f5706d9 Binary files /dev/null and b/static-data/budgets/2022/FY22 UT Dallas Operating Budget (pdf).pdf differ diff --git a/static-data/budgets/2023/AFR FY23 Exhibits A-C (pdf).pdf b/static-data/budgets/2023/AFR FY23 Exhibits A-C (pdf).pdf new file mode 100644 index 0000000..2a82e1f Binary files /dev/null and b/static-data/budgets/2023/AFR FY23 Exhibits A-C (pdf).pdf differ diff --git a/static-data/budgets/2023/FY23 UT Dallas Operating Budget (pdf).pdf b/static-data/budgets/2023/FY23 UT Dallas Operating Budget (pdf).pdf new file mode 100644 index 0000000..0057780 Binary files /dev/null and b/static-data/budgets/2023/FY23 UT Dallas Operating Budget (pdf).pdf differ diff --git a/static-data/budgets/2024/AFR FY24 Exhibits A-C (pdf).pdf b/static-data/budgets/2024/AFR FY24 Exhibits A-C (pdf).pdf new file mode 100644 index 
0000000..0a107db Binary files /dev/null and b/static-data/budgets/2024/AFR FY24 Exhibits A-C (pdf).pdf differ diff --git a/static-data/budgets/2024/FY24 UT Dallas Operating Budget (pdf).pdf b/static-data/budgets/2024/FY24 UT Dallas Operating Budget (pdf).pdf new file mode 100644 index 0000000..289a311 Binary files /dev/null and b/static-data/budgets/2024/FY24 UT Dallas Operating Budget (pdf).pdf differ diff --git a/static-data/budgets/2025/AFR FY25 Exhibits A-C (pdf).pdf b/static-data/budgets/2025/AFR FY25 Exhibits A-C (pdf).pdf new file mode 100644 index 0000000..f7c1500 Binary files /dev/null and b/static-data/budgets/2025/AFR FY25 Exhibits A-C (pdf).pdf differ diff --git a/static-data/budgets/2025/FY25 UT Dallas Operating Budget (pdf).pdf b/static-data/budgets/2025/FY25 UT Dallas Operating Budget (pdf).pdf new file mode 100644 index 0000000..37d2287 Binary files /dev/null and b/static-data/budgets/2025/FY25 UT Dallas Operating Budget (pdf).pdf differ diff --git a/static-data/budgets/2026/FY26 UT Dallas Operating Budget (pdf).pdf b/static-data/budgets/2026/FY26 UT Dallas Operating Budget (pdf).pdf new file mode 100644 index 0000000..21e6c07 Binary files /dev/null and b/static-data/budgets/2026/FY26 UT Dallas Operating Budget (pdf).pdf differ diff --git a/grade-data/17F.csv b/static-data/grades/17F.csv similarity index 100% rename from grade-data/17F.csv rename to static-data/grades/17F.csv diff --git a/grade-data/18F.csv b/static-data/grades/18F.csv similarity index 100% rename from grade-data/18F.csv rename to static-data/grades/18F.csv diff --git a/grade-data/18S.csv b/static-data/grades/18S.csv similarity index 100% rename from grade-data/18S.csv rename to static-data/grades/18S.csv diff --git a/grade-data/18U.csv b/static-data/grades/18U.csv similarity index 100% rename from grade-data/18U.csv rename to static-data/grades/18U.csv diff --git a/grade-data/19F.csv b/static-data/grades/19F.csv similarity index 100% rename from grade-data/19F.csv rename to 
static-data/grades/19F.csv diff --git a/grade-data/19S.csv b/static-data/grades/19S.csv similarity index 100% rename from grade-data/19S.csv rename to static-data/grades/19S.csv diff --git a/grade-data/19U.csv b/static-data/grades/19U.csv similarity index 100% rename from grade-data/19U.csv rename to static-data/grades/19U.csv diff --git a/grade-data/20F.csv b/static-data/grades/20F.csv similarity index 100% rename from grade-data/20F.csv rename to static-data/grades/20F.csv diff --git a/grade-data/20S.csv b/static-data/grades/20S.csv similarity index 100% rename from grade-data/20S.csv rename to static-data/grades/20S.csv diff --git a/grade-data/20U.csv b/static-data/grades/20U.csv similarity index 100% rename from grade-data/20U.csv rename to static-data/grades/20U.csv diff --git a/grade-data/21F.csv b/static-data/grades/21F.csv similarity index 100% rename from grade-data/21F.csv rename to static-data/grades/21F.csv diff --git a/grade-data/21S.csv b/static-data/grades/21S.csv similarity index 100% rename from grade-data/21S.csv rename to static-data/grades/21S.csv diff --git a/grade-data/21U.csv b/static-data/grades/21U.csv similarity index 100% rename from grade-data/21U.csv rename to static-data/grades/21U.csv diff --git a/grade-data/22F.csv b/static-data/grades/22F.csv similarity index 100% rename from grade-data/22F.csv rename to static-data/grades/22F.csv diff --git a/grade-data/22S.csv b/static-data/grades/22S.csv similarity index 100% rename from grade-data/22S.csv rename to static-data/grades/22S.csv diff --git a/grade-data/22U.csv b/static-data/grades/22U.csv similarity index 100% rename from grade-data/22U.csv rename to static-data/grades/22U.csv diff --git a/grade-data/23F.csv b/static-data/grades/23F.csv similarity index 100% rename from grade-data/23F.csv rename to static-data/grades/23F.csv diff --git a/grade-data/23S.csv b/static-data/grades/23S.csv similarity index 100% rename from grade-data/23S.csv rename to static-data/grades/23S.csv diff 
--git a/grade-data/23U.csv b/static-data/grades/23U.csv similarity index 100% rename from grade-data/23U.csv rename to static-data/grades/23U.csv diff --git a/grade-data/24F.csv b/static-data/grades/24F.csv similarity index 100% rename from grade-data/24F.csv rename to static-data/grades/24F.csv diff --git a/grade-data/24S.csv b/static-data/grades/24S.csv similarity index 100% rename from grade-data/24S.csv rename to static-data/grades/24S.csv diff --git a/grade-data/24U.csv b/static-data/grades/24U.csv similarity index 100% rename from grade-data/24U.csv rename to static-data/grades/24U.csv diff --git a/grade-data/25F.csv b/static-data/grades/25F.csv similarity index 100% rename from grade-data/25F.csv rename to static-data/grades/25F.csv diff --git a/grade-data/25S.csv b/static-data/grades/25S.csv similarity index 100% rename from grade-data/25S.csv rename to static-data/grades/25S.csv diff --git a/grade-data/25U.csv b/static-data/grades/25U.csv similarity index 100% rename from grade-data/25U.csv rename to static-data/grades/25U.csv diff --git a/uploader/budgetsUploader.go b/uploader/budgetsUploader.go new file mode 100644 index 0000000..f01d780 --- /dev/null +++ b/uploader/budgetsUploader.go @@ -0,0 +1,49 @@ +/* + This file is responsible for handling uploading of parsed budget data to MongoDB. +*/ + +package uploader + +import ( + "context" + "fmt" + "log" + "os" + "time" + + "github.com/UTDNebula/nebula-api/api/schema" + "github.com/joho/godotenv" +) + +// Note that this uploader assumes that the collection names match the names of these files, which they should. +// If the names of these collections ever change, the file names should be updated accordingly. 
+ +var budgetsFile = "budgets.json" + +func UploadBudgets(inDir string) { + + //Load env vars + if err := godotenv.Load(); err != nil { + log.Panic("Error loading .env file") + } + + //Connect to mongo + client := connectDB() + + // Get 5 minute context + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + // Open data file for reading + fptr, err := os.Open(fmt.Sprintf("%s/"+budgetsFile, inDir)) + if err != nil { + if os.IsNotExist(err) { + log.Panicf("File not found. Skipping %s", budgetsFile) + } + log.Panic(err) + } + + defer fptr.Close() + + UploadData[schema.Budget](client, ctx, fptr, false) +} diff --git a/uploader/uploader.go b/uploader/uploader.go index d2f2ae4..1c524a1 100644 --- a/uploader/uploader.go +++ b/uploader/uploader.go @@ -17,6 +17,7 @@ import ( "github.com/UTDNebula/api-tools/uploader/pipelines" "github.com/UTDNebula/nebula-api/api/schema" + "go.mongodb.org/mongo-driver/bson/primitive" ) // It's important to note that all of the files must be updated/uploaded TOGETHER! 
@@ -174,68 +175,71 @@ func UploadData[T any](client *mongo.Client, ctx context.Context, fptr *os.File, } } else { - log.Panicf("Uploading without the -replace flag is not currently supported.") - /* - // If a temp collection already exists, drop it - tempCollection := getCollection(client, "temp") - err = tempCollection.Drop(ctx) - if err != nil { - log.Panic(err) - } + if fileName != "budgets" { + log.Panicf("Uploading without the -replace flag is not currently supported for anything but budgets.") + } - // Create a temporary collection - err := client.Database("combinedDB").CreateCollection(ctx, "temp") - if err != nil { - log.Panic(err) - } + // If a temp collection already exists, drop it + tempCollection := getCollection(client, "temp") + err = tempCollection.Drop(ctx) + if err != nil { + log.Panic(err) + } - // Get the temporary collection - tempCollection = getCollection(client, "temp") + // Create a temporary collection + err := client.Database("combinedDB").CreateCollection(ctx, "temp") + if err != nil { + log.Panic(err) + } - // Convert your documents to []interface{} - docsInterface := make([]interface{}, len(docs)) - for i := range docs { - docsInterface[i] = docs[i] - } + // Get the temporary collection + tempCollection = getCollection(client, "temp") - // Add all documents decoded from the file into the temporary collection - opts := options.InsertMany().SetOrdered(false) - _, err = tempCollection.InsertMany(ctx, docsInterface, opts) - if err != nil { - log.Panic(err) - } + // Convert your documents to []interface{} + docsInterface := make([]interface{}, len(docs)) + for i := range docs { + docsInterface[i] = docs[i] + } - // Create a merge aggregate pipeline - // Matched documents from the temporary collection will replace matched documents from the Mongo collection - // Unmatched documents from the temporary collection will be inserted into the Mongo collection - var matchFilters []string - switch fileName { - case "courses": - matchFilters = 
[]string{"catalog_year", "course_number", "subject_prefix"} - case "professors": - matchFilters = []string{"first_name", "last_name"} - case "sections": - matchFilters = []string{"section_number", "course_reference", "academic_session"} - default: - log.Panic("Unrecognizable filename: " + fileName) - } + // Add all documents decoded from the file into the temporary collection + opts := options.InsertMany().SetOrdered(false) + _, err = tempCollection.InsertMany(ctx, docsInterface, opts) + if err != nil { + log.Panic(err) + } - // The documents will be added/merged into the collection with the same name as the file - // The filters for the merge aggregate pipeline are based on the file name - mergeStage := bson.D{primitive.E{Key: "$merge", Value: bson.D{primitive.E{Key: "into", Value: fileName}, primitive.E{Key: "on", Value: matchFilters}, primitive.E{Key: "whenMatched", Value: "replace"}, primitive.E{Key: "whenNotMatched", Value: "insert"}}}} + // Create a merge aggregate pipeline + // Matched documents from the temporary collection will replace matched documents from the Mongo collection + // Unmatched documents from the temporary collection will be inserted into the Mongo collection + var matchFilters []string + switch fileName { + case "courses": + matchFilters = []string{"catalog_year", "course_number", "subject_prefix"} + case "professors": + matchFilters = []string{"first_name", "last_name"} + case "sections": + matchFilters = []string{"section_number", "course_reference", "academic_session"} + case "budgets": + matchFilters = []string{"_id"} + default: + log.Panic("Unrecognizable filename: " + fileName) + } - // Execute aggregate pipeline - _, err = tempCollection.Aggregate(ctx, mongo.Pipeline{mergeStage}) - if err != nil { - log.Panic(err) - } + // The documents will be added/merged into the collection with the same name as the file + // The filters for the merge aggregate pipeline are based on the file name + mergeStage := bson.D{primitive.E{Key: "$merge", 
Value: bson.D{primitive.E{Key: "into", Value: fileName}, primitive.E{Key: "on", Value: matchFilters}, primitive.E{Key: "whenMatched", Value: "replace"}, primitive.E{Key: "whenNotMatched", Value: "insert"}}}} - // Drop the temporary collection - err = tempCollection.Drop(ctx) - if err != nil { - log.Panic(err) - } - */ + // Execute aggregate pipeline + _, err = tempCollection.Aggregate(ctx, mongo.Pipeline{mergeStage}) + if err != nil { + log.Panic(err) + } + + // Drop the temporary collection + err = tempCollection.Drop(ctx) + if err != nil { + log.Panic(err) + } } log.Println("Done uploading " + fileName + ".json!") diff --git a/utils/methods.go b/utils/methods.go index 70b8bfe..addb000 100644 --- a/utils/methods.go +++ b/utils/methods.go @@ -321,3 +321,34 @@ func ConvertFromInterface[T string | float64](value any) *T { } return nil } + +type LinkResult struct { + Text string + Href string +} + +func ExtractTextAndHref(nodes []*cdp.Node, chromedpCtx context.Context) []LinkResult { + output := []LinkResult{} + var err error + + // Extract href and text + for _, n := range nodes { + var href, text string + // Get href attribute + for i := 0; i < len(n.Attributes); i += 2 { + if n.Attributes[i] == "href" { + href = n.Attributes[i+1] + } + } + // Get inner text + err = chromedp.Run(chromedpCtx, + chromedp.TextContent(fmt.Sprintf(`a[href="%s"]`, href), &text, chromedp.ByQuery), + ) + if err != nil { + panic(err) + } + output = append(output, LinkResult{text, href}) + } + + return output +} diff --git a/utils/parser.go b/utils/parser.go new file mode 100644 index 0000000..b0ae6f0 --- /dev/null +++ b/utils/parser.go @@ -0,0 +1,238 @@ +// Package utils provides shared helpers for parsing workflows. 
+package utils + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "reflect" + "strconv" + "sync" + + "strings" + + "github.com/UTDNebula/nebula-api/api/schema" + "google.golang.org/genai" +) + +// Read the text from the first n pages of a PDF +// Using external program pdftotext +// Code requires having pdftotext installed: https://www.xpdfreader.com/pdftotext-man.html +// apt-get install -y poppler-utils +func ReadPdf(path string, lastPage int) (string, error) { + cmd := exec.Command("pdftotext", "-raw", path, "-") + if lastPage > 0 { + cmd = exec.Command("pdftotext", "-l", strconv.Itoa(lastPage), "-raw", path, "-") + } + + var out bytes.Buffer + var stderr bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = &stderr + + if err := cmd.Run(); err != nil { + return "", fmt.Errorf("failed to run pdftotext: %v (%s)", err, stderr.String()) + } + + return out.String(), nil +} + +// Check cache for a response to the same prompt +func CheckCache(hash string, apiBucket string) (string, error) { + apiUrl, apiKey, apiStorageKey, err := getNebulaKeys() + if err != nil { + return "", err + } + + client := &http.Client{} + + // Make request + req, err := http.NewRequest("GET", apiUrl+"storage/"+apiBucket+"/"+hash, nil) + if err != nil { + return "", err + } + req.Header.Add("x-api-key", apiKey) + req.Header.Add("x-storage-key", apiStorageKey) + resp, err := client.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + + // Read the response body + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", err + } + var parsedBody schema.APIResponse[schema.ObjectInfo] + err = json.Unmarshal([]byte(body), &parsedBody) + if err != nil { + // If this errors, return ("", nil) to indicate not found + return "", nil + } + + // Fetch object + req, err = http.NewRequest("GET", parsedBody.Data.MediaLink, nil) + if err != nil { + return "", err + } + resp, err = client.Do(req) + if err != nil { + return "", err + } + 
defer resp.Body.Close() + + // Read the response body + body, err = io.ReadAll(resp.Body) + if err != nil { + return "", err + } + + return string(body), nil +} + +// Store Gemini client to only create once +var once sync.Once +var geminiClient *genai.Client + +// Create client only once +// Auth is from GOOGLE_GENAI_USE_VERTEXAI, GOOGLE_CLOUD_PROJECT and GOOGLE_APPLICATION_CREDENTIALS environment variables and service account JSON which is created from GEMINI_SERVICE_ACCOUNT +func GetGeminiClient() *genai.Client { + once.Do(func() { + // Create JSON file + serviceAccount, err := GetEnv("GEMINI_SERVICE_ACCOUNT") + if err != nil { + panic(err) + } + jsonFile, err := GetEnv("GOOGLE_APPLICATION_CREDENTIALS") + if err != nil { + panic(err) + } + err = os.WriteFile(jsonFile, []byte(serviceAccount), 0644) + if err != nil { + panic(err) + } + + // Create client + geminiClient, err = genai.NewClient(context.Background(), + &genai.ClientConfig{ + Project: "api-tools-451421", + Location: "us-central1", + Backend: genai.BackendVertexAI, + }) + if err != nil { + panic(err) + } + }) + return geminiClient +} + +func StructToSchema(t reflect.Type) *genai.Schema { + // Handle pointers + isNullable := false + if t.Kind() == reflect.Ptr { + isNullable = true + t = t.Elem() + } + + var schema *genai.Schema + + switch t.Kind() { + case reflect.Struct: + properties := make(map[string]*genai.Schema) + var required []string + + for i := 0; i < t.NumField(); i++ { + field := t.Field(i) + // Use the JSON tag for the property name + jsonTag := field.Tag.Get("json") + if jsonTag == "" || jsonTag == "-" { + continue + } + // Handle comma-separated tags like "name,omitempty" + name := strings.Split(jsonTag, ",")[0] + + properties[name] = StructToSchema(field.Type) + if field.Type.Kind() != reflect.Ptr { + required = append(required, name) + } + } + + schema = &genai.Schema{ + Type: genai.TypeObject, + Properties: properties, + Required: required, + } + + case reflect.Slice, reflect.Array: + 
schema = &genai.Schema{ + Type: genai.TypeArray, + Items: StructToSchema(t.Elem()), + } + + case reflect.String: + schema = &genai.Schema{Type: genai.TypeString} + + case reflect.Int, reflect.Int64, reflect.Float64: + schema = &genai.Schema{Type: genai.TypeNumber} + + case reflect.Bool: + schema = &genai.Schema{Type: genai.TypeBoolean} + + default: + schema = &genai.Schema{Type: genai.TypeString} + } + + schema.Nullable = &isNullable + return schema +} + +// Upload AI response to cache +func SetCache(hash string, result string, apiBucket string) error { + apiUrl, apiKey, apiStorageKey, err := getNebulaKeys() + if err != nil { + return err + } + + // Make request + jsonStr := []byte(result) + bodyReader := bytes.NewBuffer(jsonStr) + req, err := http.NewRequest("POST", apiUrl+"storage/"+apiBucket+"/"+hash, bodyReader) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + req.Header.Add("x-api-key", apiKey) + req.Header.Add("x-storage-key", apiStorageKey) + client := &http.Client{} + resp, err := client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + return nil +} + +// Get all the keys to access the Nebula API storage routes +func getNebulaKeys() (string, string, string, error) { + apiUrl, err := GetEnv("NEBULA_API_URL") + if err != nil { + return "", "", "", err + } + apiKey, err := GetEnv("NEBULA_API_KEY") + if err != nil { + return "", "", "", err + } + apiStorageKey, err := GetEnv("NEBULA_API_STORAGE_KEY") + if err != nil { + return "", "", "", err + } + + return apiUrl, apiKey, apiStorageKey, nil +}