"""Extract plain text from a PDF file, page by page.

Usage:
    python3 pdf_to_text.py <pdf_path> [max_pages]

Origin: commit a6395391d8f57b607fb5abc77f113744ccccbf48, "Add PDF text
extraction tool and inventory curriculum materials". The same commit adds a
``requirements.txt`` listing ``pypdf`` and ``cryptography``.
"""
import sys
from itertools import islice

USAGE = "Usage: python3 pdf_to_text.py <pdf_path> [max_pages]"


def _iter_page_text(pdf_path, max_pages=None):
    """Yield the extracted text of each page, stopping after *max_pages*."""
    # Imported lazily so usage/argument errors can be reported even when the
    # third-party dependency is not installed.
    from pypdf import PdfReader

    reader = PdfReader(pdf_path)
    # islice() rejects negative stops; clamp so a negative limit reads nothing.
    limit = None if max_pages is None else max(max_pages, 0)
    for page in islice(reader.pages, limit):
        yield page.extract_text()


def _render_text(pdf_path, max_pages=None):
    """Build the page-delimited text dump; propagates any extraction error."""
    parts = []
    for number, page_text in enumerate(_iter_page_text(pdf_path, max_pages), 1):
        parts.append(f"--- Page {number} ---\n")
        if page_text:
            parts.append(page_text + "\n")
        else:
            # Typical for scanned / image-only pages.
            parts.append("[No text found on this page]\n")
    return "".join(parts)


def extract_text(pdf_path, max_pages=None):
    """
    Extracts text from a PDF file.

    Each page is introduced by a ``--- Page N ---`` line; pages that yield no
    text are marked with ``[No text found on this page]``.

    :param pdf_path: Path to the PDF file.
    :param max_pages: Maximum number of pages to read. If None, reads all
        pages. Values below 1 read no pages.
    :return: The extracted text. This function never raises: on failure it
        returns a string of the form ``"Error: <message>"``.
    """
    try:
        return _render_text(pdf_path, max_pages)
    except Exception as e:  # deliberate: callers rely on the no-raise contract
        return f"Error: {e}"


def main(argv=None):
    """
    Command-line entry point.

    :param argv: Argument vector including the program name. Defaults to
        ``sys.argv``.
    :return: Process exit status: 0 on success, 1 on a usage error, an
        invalid page count, or an extraction failure.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print(USAGE, file=sys.stderr)
        return 1

    path = argv[1]
    pages = None
    if len(argv) > 2:
        try:
            pages = int(argv[2])
        except ValueError:
            print(
                f"Error: Invalid page count '{argv[2]}'. "
                "Please provide an integer.",
                file=sys.stderr,
            )
            return 1
        if pages < 0:
            print(
                f"Error: Invalid page count '{argv[2]}'. "
                "Please provide a non-negative integer.",
                file=sys.stderr,
            )
            return 1

    try:
        text = _render_text(path, pages)
    except Exception as e:  # top-level boundary: report and fail the process
        print(f"Error: {e}", file=sys.stderr)
        return 1

    print(text)
    return 0


if __name__ == "__main__":
    sys.exit(main())