diff --git a/PDF_Compliance_Tool_BATCH_Hocr_+_Redaction.ipynb b/PDF_Compliance_Tool_BATCH_Hocr_+_Redaction.ipynb new file mode 100644 index 0000000..ad81509 --- /dev/null +++ b/PDF_Compliance_Tool_BATCH_Hocr_+_Redaction.ipynb @@ -0,0 +1,318 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OY-Tn6A_z-rR" + }, + "source": [ + "# 🔒 PDF HOCR + Redact: Preserve layout with HOCR and Tesseract 5\n", + "This notebook can function in a few ways, depending on how you set the run_ocr and redact_pdf variables. It first prompts you to mount Google Drive. You can change the input/output paths, or just create \"Input\" and \"Output\" folders in MyDrive to reuse them.\n", + "\n", + "Once mounted, it installs all packages and dependencies per cell. So if you need OCR but not redaction, set the **redact_pdf** variable to \"False\" and you don't install the unnecessary redaction packages. The same goes for redaction of text-based PDFs, where there's no need for OCR: set **run_ocr** to \"False\" and you can simply enter your word list in the \"\", \"\", format and go straight to redaction.\n", + "\n", + "# 👨‍💻 0\n", + "Or do it all: TRUE + TRUE = both OCR and redaction." 
+ ], + "id": "OY-Tn6A_z-rR" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jmD9Cf9az-rc", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "24354244-a697-4163-f9e4-68f8a28a7e29" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting pymupdf\n", + " Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)\n", + "Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m24.1/24.1 MB\u001b[0m \u001b[31m35.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: pymupdf\n", + "Successfully installed pymupdf-1.26.3\n" + ] + } + ], + "source": [ + "# ๐Ÿ“ฆ Install PyMuPDF\n", + "!pip install pymupdf" + ], + "id": "jmD9Cf9az-rc" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jPnq0yTSz-rh", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "af438054-fbf3-439d-b34f-c6c8c7a6281e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Mounted at /content/drive\n" + ] + } + ], + "source": [ + "# ๐Ÿ“‚ Mount Google Drive\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "id": "jPnq0yTSz-rh" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nRSZKgpPz-rj" + }, + "outputs": [], + "source": [ + "# ๐Ÿงพ Word List\n", + "words_to_redact = [\"1628\", \"High\", \"Cir.\", \"1273\", \"1500 Fawn Run Crossing\", \"Fawn Run Crossing\", \"Fawn Run\",\n", + " \"Fawn\", \"Run\", \"crossing\", \"Skyridge\", \"Dallas\", \"Cheyenne\", \"Austin\", \"Norman\", \"Tucson\",\n", + " \"Yukon\", \"Cleveland\", \"Tiffaney Norton\", \"Dallas Norton\", \"Christopher\", \"Brianna Jackson\",\n", + " \"Brionna 
Jackson\", \"Brianna\", \"Bri\", \"Jeffry Jackson\", \"Jeffrey Jackson\", \"Caitlin Jackson\",\n", + " \"Christi Cornett\", \"Christi Comett\", \"Angela\", \"Thagard\", \"Marilyn\", \"Williams\", \"Eufaula\",\n", + " \"Jones\", \"73069\", \"73071\", \"Cory\", \"Lori\", \"Puckett\", \"Virgil\", \"Black\", \"Ortega\", \"Jackson\",\n", + " \"Pierce\", \"Troy\", \"Judy\", \"Sean\", \"Bailey\", \"Ferguson\", \"Norton\", \"Whatley\", \"Nedwick\",\n", + " \"Douglas\", \"Balkman\", \"Tayra\", \"Christy\", \"jilge\", \"Mary\", \"Abbott\", \"Children's\", \"Christi\",\n", + " \"Christy\", \"Jennifer\", \"Shyanne\", \"Tara\", \"Riley\", \"Ryleigh\", \"Caitlyn\", \"Caitlin\", \"B.J.\",\n", + " \"B. J.\", \"BJ\", \"Bo\", \"Bobo\", \"Tay\", \"Tiffany\", \"Cornett\", \"Jacobi\", \"Norman\", \"Nonnan\",\n", + " \"Dallas C. Norton\", \"Ortega\", \"MINOR CHILD B.J.\", \"bri\", \"Eisenhower\", \"Long Fellow\",\n", + " \"Norman North\", \"Angela\",\"Thagard\",\"Tanya\", \"Burcham\", \"Thomas\", \"Keith\", \"john\",\"Hadden\",\n", + " \"127\", \"Crestland\", \"Charles\", \"Peters\", \"200\", \"Eufaula\", \"Jones\", \"MR. 
ORTEGA\"]" + ], + "id": "nRSZKgpPz-rj" + }, + { + "cell_type": "code", + "source": [ + "# ๐Ÿ“Œ Optional: OCR with Tesseract 5 + HOCR (accurate overlay, in-place update)\n", + "run_ocr = True # ๐Ÿ” Set to False to disable\n", + "\n", + "if run_ocr == True:\n", + " # Install system dependencies for pdf2image\n", + " !apt-get update\n", + " !apt-get install -y poppler-utils\n", + "\n", + " !pip install pytesseract\n", + " import pytesseract\n", + " !pip install pdf2image\n", + " from pdf2image import convert_from_path\n", + " from PIL import Image\n", + " import os\n", + " !pip install PyPDF2\n", + " from PyPDF2 import PdfMerger\n", + " # !pip install poppler # This is a system dependency, not a pip package" + ], + "metadata": { + "id": "CU6xD7cDJGW7", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "46abb749-98c1-4c15-dfca-a99d51ad722b" + }, + "id": "CU6xD7cDJGW7", + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\r0% [Working]\r \rHit:1 http://archive.ubuntu.com/ubuntu jammy InRelease\n", + "\r0% [Waiting for headers] [Waiting for headers] [Connected to cloud.r-project.or\r \rGet:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n", + "\r \rGet:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n", + "Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n", + "Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease [1,581 B]\n", + "Get:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n", + "Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n", + "Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease\n", + "Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease\n", + "Get:10 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [4,932 kB]\n", + "Hit:11 
https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease\n", + "Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,148 kB]\n", + "Get:13 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,267 kB]\n", + "Get:14 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,572 kB]\n", + "Get:15 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [5,139 kB]\n", + "Get:16 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,461 kB]\n", + "Get:17 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 Packages [1,840 kB]\n", + "Get:18 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,762 kB]\n", + "Get:19 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,126 kB]\n", + "Fetched 33.6 MB in 7s (5,019 kB/s)\n", + "Reading package lists... Done\n", + "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n", + "Reading package lists... Done\n", + "Building dependency tree... Done\n", + "Reading state information... Done\n", + "The following NEW packages will be installed:\n", + " poppler-utils\n", + "0 upgraded, 1 newly installed, 0 to remove and 36 not upgraded.\n", + "Need to get 186 kB of archives.\n", + "After this operation, 697 kB of additional disk space will be used.\n", + "Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]\n", + "Fetched 186 kB in 0s (1,180 kB/s)\n", + "Selecting previously unselected package poppler-utils.\n", + "(Reading database ... 
126281 files and directories currently installed.)\n", + "Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...\n", + "Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...\n", + "Setting up poppler-utils (22.02.0-2ubuntu0.8) ...\n", + "Processing triggers for man-db (2.10.2-1) ...\n", + "Collecting pytesseract\n", + " Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)\n", + "Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.11/dist-packages (from pytesseract) (25.0)\n", + "Requirement already satisfied: Pillow>=8.0.0 in /usr/local/lib/python3.11/dist-packages (from pytesseract) (11.2.1)\n", + "Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)\n", + "Installing collected packages: pytesseract\n", + "Successfully installed pytesseract-0.3.13\n", + "Collecting pdf2image\n", + " Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)\n", + "Requirement already satisfied: pillow in /usr/local/lib/python3.11/dist-packages (from pdf2image) (11.2.1)\n", + "Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)\n", + "Installing collected packages: pdf2image\n", + "Successfully installed pdf2image-1.17.0\n", + "Collecting PyPDF2\n", + " Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)\n", + "Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m232.6/232.6 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: PyPDF2\n", + "Successfully installed PyPDF2-3.0.1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "input_dir = '/content/drive/MyDrive/Input'\n", + "def ocr_pdf_hocr (input_path, output_path):\n", + " import pytesseract\n", + " temp_img_dir = '/content/ocr_images'\n", + " os.makedirs(temp_img_dir, exist_ok=True)\n", + " images = 
convert_from_path(input_path, dpi=300, fmt='jpeg', output_folder=temp_img_dir)\n", + " page_paths = []\n", + "\n", + " for i, img in enumerate(images):\n", + " img_path = f'/content/page_{i}.jpeg'\n", + " img.save(img_path, 'JPEG')\n", + "\n", + " # Generate HOCR (layout-aware text layer)\n", + " hocr_output = pytesseract.image_to_pdf_or_hocr(img, extension='pdf', lang='eng', config='hocr')\n", + " pdf_out = f'/content/page_hocr_{i}.pdf'\n", + " with open(pdf_out, 'wb') as f:\n", + " f.write(hocr_output)\n", + " page_paths.append(pdf_out)\n", + "\n", + " merger = PdfMerger()\n", + " for p in page_paths:\n", + " merger.append(p)\n", + " merger.write(output_path)\n", + " merger.close()\n", + " print(f\"โœ… HOCR overlay OCR complete: {output_path}\")\n", + "\n", + "# ๐Ÿ” Overwrite files in-place\n", + "for file in os.listdir(input_dir):\n", + " if file.lower().endswith('.pdf'):\n", + " src = os.path.join(input_dir, file)\n", + " temp_out = os.path.join('/content', f'ocr_{file}')\n", + " try:\n", + " ocr_pdf_hocr(src, temp_out)\n", + " import shutil\n", + " shutil.move(temp_out, src) # ๐Ÿ” Replace original with OCRโ€™d version\n", + " except Exception as e:\n", + " print(f\"โš ๏ธ HOCR OCR failed for {file}: {e}\")" + ], + "metadata": { + "id": "CkdAzS0AWV6q" + }, + "id": "CkdAzS0AWV6q", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vSfDa6_vz-ro" + }, + "outputs": [], + "source": [ + "# ๐Ÿงผ Optional Redaction Script\n", + "redact_pdf = True ๐Ÿ” Set to False to disable\n", + "\n", + "If redact_pdf == True:\n", + "import fitz\n", + "import os\n", + "import re # Import regular expressions - Keep import just in case, though not used in new redact_pdf\n", + "\n", + "input_dir = \"/content/drive/MyDrive/Input\" # Corrected directory name\n", + "output_dir = \"/content/drive/MyDrive/Output\"\n", + "os.makedirs(output_dir, exist_ok=True)\n", + "\n", + "def redact_pdf(input_path, output_path, 
words):\n", + " doc = fitz.open(input_path)\n", + " word_set = set(w.lower() for w in words) # Lowercase for case-insensitive match\n", + "\n", + " for page in doc:\n", + " wordlist = page.get_text(\"words\") # list of (x0, y0, x1, y1, \"word\", block_no, line_no, word_no)\n", + " for w in wordlist:\n", + " if w[4].lower() in word_set:\n", + " rect = fitz.Rect(w[:4])\n", + " page.draw_rect(rect, fill=(0, 0, 0))\n", + " doc.save(output_path, garbage=4, deflate=True)\n", + " doc.close()\n", + "\n", + "\n", + "# ๐Ÿ”„ Process All PDFs\n", + "for file in os.listdir(input_dir):\n", + " if file.lower().endswith(\".pdf\"):\n", + " input_path = os.path.join(input_dir, file)\n", + " output_path = os.path.join(output_dir, f\"redacted_{file}\")\n", + " print(f\"Redacting: {file}\")\n", + " redact_pdf(input_path, output_path, words_to_redact)\n", + " print(f\"Saved to: {output_path}\")\n", + "\n", + "print(\"โœ… Redaction complete.\")" + ], + "id": "vSfDa6_vz-ro" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "" + }, + "colab": { + "provenance": [], + "include_colab_link": true + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/app/build.gradle b/app/build.gradle index 3b288dd..d05a6a8 100644 --- a/app/build.gradle +++ b/app/build.gradle @@ -9,6 +9,9 @@ android { versionCode 11 versionName "1.10.20201004" testInstrumentationRunner 'androidx.test.runner.AndroidJUnitRunner' + ndk { + abiFilters 'armeabi-v7a', 'arm64-v8a' // Optional: keep app smaller + } } buildTypes { release { @@ -30,7 +33,7 @@ dependencies { androidTestImplementation 'androidx.test.espresso:espresso-core:3.3.0' implementation project(':scanlibrary') - + implementation 'androidx.appcompat:appcompat:1.2.0' implementation 'com.google.android.material:material:1.2.1' implementation 'androidx.legacy:legacy-support-v4:1.0.0' @@ -40,6 +43,7 @@ dependencies { implementation 
'androidx.lifecycle:lifecycle-extensions:2.2.0' implementation 'androidx.lifecycle:lifecycle-viewmodel:2.2.0' + implementation 'com.google.code.tesseract.android:tess-two:9.1.0' implementation 'com.google.android.gms:play-services-vision:20.1.2' implementation 'com.google.firebase:firebase-core:17.5.0' implementation 'com.google.firebase:firebase-ml-vision:24.1.0' @@ -48,4 +52,4 @@ dependencies { } apply plugin: 'com.google.gms.google-services' -com.google.gms.googleservices.GoogleServicesPlugin.config.disableVersionCheck = true \ No newline at end of file +com.google.gms.googleservices.GoogleServicesPlugin.config.disableVersionCheck = true diff --git a/app/src/assets b/app/src/assets new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/app/src/assets @@ -0,0 +1 @@ + diff --git a/app/src/eng.traineddata b/app/src/eng.traineddata new file mode 100644 index 0000000..f4744c2 Binary files /dev/null and b/app/src/eng.traineddata differ diff --git a/app/src/main/java/com/babanomania/pdfscanner/OCRActivity.java b/app/src/main/java/com/babanomania/pdfscanner/OCRActivity.java index 8d1acce..9f4c0d5 100644 --- a/app/src/main/java/com/babanomania/pdfscanner/OCRActivity.java +++ b/app/src/main/java/com/babanomania/pdfscanner/OCRActivity.java @@ -1,154 +1,156 @@ package com.babanomania.pdfscanner; -import android.content.Context; -import android.content.Intent; -import android.graphics.Bitmap; -import android.graphics.pdf.PdfRenderer; -import android.os.AsyncTask; -import android.os.Environment; -import android.os.ParcelFileDescriptor; -import android.os.Bundle; -import android.util.Log; -import android.view.View; -import android.widget.Button; -import android.widget.EditText; -import android.widget.ProgressBar; -import android.widget.RelativeLayout; - -import androidx.appcompat.app.AppCompatActivity; - -import com.babanomania.pdfscanner.utils.OCRUtils; -import com.babanomania.pdfscanner.utils.UIUtil; - -import java.io.File; -import java.util.ArrayList; +import 
android.content.Context; import android.content.DialogInterface; import android.content.Intent; import android.graphics.Bitmap; import android.graphics.pdf.PdfRenderer; import android.os.AsyncTask; import android.os.Bundle; import android.os.Environment; import android.os.ParcelFileDescriptor; import android.util.Log; import android.view.View; import android.widget.Button; import android.widget.EditText; import android.widget.ProgressBar; import android.widget.RelativeLayout; -public class OCRActivity extends AppCompatActivity { - - public EditText ocrText; - public Button shareButton; - private ProgressBar progressBar; - public static String FILE_PATH = "file_path"; - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - setContentView(R.layout.activity_ocr); - - RelativeLayout relativeLayout = findViewById(R.id.rl); - UIUtil.setLightNavigationBar( relativeLayout, this ); - - this.ocrText = findViewById(R.id.ocrText); - this.shareButton = findViewById(R.id.shareBtn); - this.progressBar = findViewById(R.id.extractingProgress); - - this.ocrText.setText( getResources().getString(R.string.ocr_waiting_text) ); - setTitle( getResources().getString(R.string.ocr_title) ); - - this.progressBar.setVisibility(View.VISIBLE); - this.shareButton.setVisibility(View.GONE); - this.shareButton.setOnClickListener( - new View.OnClickListener() { - @Override - public void onClick(View v) { - - String textToShare = ocrText.getText().toString(); - Intent sharingIntent = new Intent(android.content.Intent.ACTION_SEND); - sharingIntent.setType("text/plain"); - sharingIntent.putExtra(android.content.Intent.EXTRA_TEXT, textToShare); - startActivity(Intent.createChooser(sharingIntent, "Share Using")); - - } - } - ); - - Bundle bundle = getIntent().getExtras(); - final String filePath = bundle.getString(FILE_PATH); - new OCRExtractTask( this, getApplicationContext(), filePath ) - .execute(); - } - - public void setText( String content ){ - 
this.ocrText.setText( content ); - } - - private class OCRExtractTask extends AsyncTask { - - private OCRActivity ocrActivity; - private Context context; - private String filePath; - - public OCRExtractTask( OCRActivity ocrActivity, Context context, String filePath ){ - this.ocrActivity = ocrActivity; - this.context = context; - this.filePath = filePath; - } +import androidx.appcompat.app.AlertDialog; import androidx.appcompat.app.AppCompatActivity; - @Override - protected String doInBackground(String... strings) { - try { +import com.babanomania.pdfscanner.utils.OCRUtils; import com.babanomania.pdfscanner.utils.PDFUtils; import com.babanomania.pdfscanner.utils.UIUtil; - ArrayList bitmaps = new ArrayList<>(); - final String baseDirectory = context.getString(R.string.base_storage_path); - final File sd = Environment.getExternalStorageDirectory(); +import java.io.File; import java.util.ArrayList; import java.util.List; - File toOcr = new File(sd, baseDirectory + this.filePath); +public class OCRActivity extends AppCompatActivity { - PdfRenderer renderer = new PdfRenderer(ParcelFileDescriptor.open(toOcr, ParcelFileDescriptor.MODE_READ_ONLY)); +public EditText ocrText; +public Button shareButton; +private ProgressBar progressBar; +public static String FILE_PATH = "file_path"; + +@Override +protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_ocr); + + RelativeLayout relativeLayout = findViewById(R.id.rl); + UIUtil.setLightNavigationBar(relativeLayout, this); + + this.ocrText = findViewById(R.id.ocrText); + this.shareButton = findViewById(R.id.shareBtn); + this.progressBar = findViewById(R.id.extractingProgress); + + this.ocrText.setText(getResources().getString(R.string.ocr_waiting_text)); + setTitle(getResources().getString(R.string.ocr_title)); + + this.progressBar.setVisibility(View.VISIBLE); + this.shareButton.setVisibility(View.GONE); + this.shareButton.setOnClickListener(v -> { + String 
textToShare = ocrText.getText().toString(); + Intent sharingIntent = new Intent(Intent.ACTION_SEND); + sharingIntent.setType("text/plain"); + sharingIntent.putExtra(Intent.EXTRA_TEXT, textToShare); + startActivity(Intent.createChooser(sharingIntent, "Share Using")); + }); + + Bundle bundle = getIntent().getExtras(); + final String filePath = bundle.getString(FILE_PATH); + showRedactionChoiceDialog(filePath); +} - Bitmap bitmap; - final int pageCount = renderer.getPageCount(); - for (int i = 0; i < pageCount; i++) { - PdfRenderer.Page page = renderer.openPage(i); +private void showRedactionChoiceDialog(final String filePath) { + final CharSequence[] options = {"HIPAA Guidelines", "Custom Word List", "No Redaction"}; + + new AlertDialog.Builder(this) + .setTitle("Select Redaction Mode") + .setItems(options, (dialog, which) -> { + boolean useHIPAA = false; + boolean applyRedaction = true; + + switch (which) { + case 0: // HIPAA + useHIPAA = true; + break; + case 1: // Custom list + useHIPAA = false; + break; + case 2: // No redaction + applyRedaction = false; + break; + } - int width = context.getResources().getDisplayMetrics().densityDpi / 72 * page.getWidth(); - int height = context.getResources().getDisplayMetrics().densityDpi / 72 * page.getHeight(); - bitmap = Bitmap.createBitmap(width, height, Bitmap.Config.ARGB_8888); + new OCRExtractTask(OCRActivity.this, getApplicationContext(), filePath, applyRedaction, useHIPAA).execute(); + }) + .setCancelable(false) + .show(); +} - page.render(bitmap, null, null, PdfRenderer.Page.RENDER_MODE_FOR_DISPLAY); +public void setText(String content) { + this.ocrText.setText(content); +} - bitmaps.add(bitmap); +private class OCRExtractTask extends AsyncTask { + private OCRActivity ocrActivity; + private Context context; + private String filePath; + private boolean applyRedaction; + private boolean redactWithHIPAA; + + public OCRExtractTask(OCRActivity ocrActivity, Context context, String filePath, boolean applyRedaction, boolean 
redactWithHIPAA) { + this.ocrActivity = ocrActivity; + this.context = context; + this.filePath = filePath; + this.applyRedaction = applyRedaction; + this.redactWithHIPAA = redactWithHIPAA; + } - // close the page - page.close(); + @Override + protected String doInBackground(String... strings) { + try { + ArrayList bitmaps = new ArrayList<>(); + final String baseDirectory = context.getString(R.string.base_storage_path); + final File sd = Environment.getExternalStorageDirectory(); + File toOcr = new File(sd, baseDirectory + this.filePath); + + PdfRenderer renderer = new PdfRenderer(ParcelFileDescriptor.open(toOcr, ParcelFileDescriptor.MODE_READ_ONLY)); + final int pageCount = renderer.getPageCount(); + + for (int i = 0; i < pageCount; i++) { + PdfRenderer.Page page = renderer.openPage(i); + int width = context.getResources().getDisplayMetrics().densityDpi / 72 * page.getWidth(); + int height = context.getResources().getDisplayMetrics().densityDpi / 72 * page.getHeight(); + Bitmap bitmap = Bitmap.createBitmap(width, height, Bitmap.Config.ARGB_8888); + page.render(bitmap, null, null, PdfRenderer.Page.RENDER_MODE_FOR_DISPLAY); + bitmaps.add(bitmap); + page.close(); + } + renderer.close(); + StringBuilder extractedText = new StringBuilder(); + for (Bitmap eachPage : bitmaps) { + String pageText = OCRUtils.getTextFromBitmap(context, eachPage); + if (applyRedaction) { + pageText = applyRedactions(pageText, redactWithHIPAA); } + extractedText.append(pageText).append("\n\n"); + } - // close the renderer - renderer.close(); - - StringBuffer extractedText = new StringBuffer(); - for (Bitmap eachPage : bitmaps ) { - extractedText.append( - OCRUtils.getTextFromBitmap(context, eachPage) - ); - - } + String finalText = extractedText.toString(); + PDFUtils.writeSearchablePDF(toOcr.getName(), finalText, context); + + runOnUiThread(() -> { + ocrActivity.setText(finalText); + shareButton.setVisibility(View.VISIBLE); + progressBar.setVisibility(View.GONE); + }); + + } catch (Exception 
e) { + Log.e("Clean Scan", "Unable to extract text", e); + runOnUiThread(() -> { + ocrActivity.setText(getResources().getString(R.string.ocr_failed_text)); + shareButton.setVisibility(View.GONE); + progressBar.setVisibility(View.GONE); + }); + } + return null; + } - Log.d( "Clean Scan", "detected text : " + extractedText ); - this.ocrActivity.setText(extractedText.toString() ); - runOnUiThread(new Runnable() { - @Override - public void run() { - shareButton.setVisibility(View.VISIBLE); - progressBar.setVisibility(View.GONE); - } - }); - - }catch (Exception e){ - Log.e( "Clean Scan", "Unable to extract text", e ); - this.ocrActivity.setText( getResources().getString(R.string.ocr_failed_text) ); - runOnUiThread(new Runnable() { - @Override - public void run() { - shareButton.setVisibility(View.GONE); - progressBar.setVisibility(View.GONE); - } - }); - - } finally { - return null; - } + private String applyRedactions(String text, boolean useHIPAA) { + List redactionList = useHIPAA ? OCRUtils.getHIPAAWordList(context) : OCRUtils.getCustomWordList(context); + for (String term : redactionList) { + text = text.replaceAll("(?i)\\b" + term + "\\b", "[REDACTED]"); } + return text; } } + +} + +