diff --git a/PDF_Compliance_Tool_BATCH_Hocr_+_Redaction.ipynb b/PDF_Compliance_Tool_BATCH_Hocr_+_Redaction.ipynb
new file mode 100644
index 0000000..ad81509
--- /dev/null
+++ b/PDF_Compliance_Tool_BATCH_Hocr_+_Redaction.ipynb
@@ -0,0 +1,318 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "OY-Tn6A_z-rR"
+ },
+ "source": [
+ "# ๐ PDF HOCR + Redact Preserve layout with HOCR and Tesseract5\n",
+ "This notebook can run in a few modes, depending on how you set the `run_ocr` and `redact_pdf` variables. It first prompts you to mount Google Drive. You can change the input/output paths, or simply create \"Input\" and \"Output\" folders in MyDrive and reuse them.\n",
+ "\n",
+ "Once Drive is mounted, each cell installs the packages and dependencies it needs. If you need OCR but not redaction, set the **redact_pdf** variable to `False` so the redaction code and its imports are skipped. Likewise, text-based PDFs do not need OCR: set **run_ocr** to `False`, enter your word list in the `\"word\", \"word\"` format, and go straight to redaction.\n",
+ "\n",
+ "# ๐จโ๐ป 0\n",
+ "OR DO IT ALL TRUE+TRUE = OCR/Redaction Both."
+ ],
+ "id": "OY-Tn6A_z-rR"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "jmD9Cf9az-rc",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "24354244-a697-4163-f9e4-68f8a28a7e29"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Collecting pymupdf\n",
+ " Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)\n",
+ "Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)\n",
+ "\u001b[2K \u001b[90mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m \u001b[32m24.1/24.1 MB\u001b[0m \u001b[31m35.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hInstalling collected packages: pymupdf\n",
+ "Successfully installed pymupdf-1.26.3\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Install PyMuPDF, pinned to the version this notebook was last run with so that a\n",
+ "# fresh runtime reproduces the same behaviour. %pip targets the kernel's environment.\n",
+ "%pip install -q pymupdf==1.26.3"
+ ],
+ "id": "jmD9Cf9az-rc"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "jPnq0yTSz-rh",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "af438054-fbf3-439d-b34f-c6c8c7a6281e"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Mounted at /content/drive\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Mount Google Drive so the Input/Output folders under MyDrive are reachable\n",
+ "from google.colab import drive\n",
+ "\n",
+ "drive.mount(mountpoint='/content/drive')"
+ ],
+ "id": "jPnq0yTSz-rh"
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "nRSZKgpPz-rj"
+ },
+ "outputs": [],
+ "source": [
+ "# Redaction word list\n",
+ "#\n",
+ "# SECURITY: the terms to redact are themselves sensitive (names, addresses, ZIP codes),\n",
+ "# so they must not be committed inside the notebook. Keep them in a private text file on\n",
+ "# your own Drive instead: one word or phrase per line. Blank lines and lines starting\n",
+ "# with '#' are ignored.\n",
+ "import os\n",
+ "\n",
+ "WORD_LIST_PATH = '/content/drive/MyDrive/Input/redaction_terms.txt'\n",
+ "\n",
+ "# Optional: terms typed directly into this cell for a one-off run, e.g. [\"word\", \"two words\"].\n",
+ "# Clear the list again before saving or sharing the notebook.\n",
+ "inline_terms = []\n",
+ "\n",
+ "words_to_redact = list(inline_terms)\n",
+ "if os.path.exists(WORD_LIST_PATH):\n",
+ "    with open(WORD_LIST_PATH, encoding='utf-8') as term_file:\n",
+ "        for raw_line in term_file:\n",
+ "            term = raw_line.strip()\n",
+ "            if term and not term.startswith('#'):\n",
+ "                words_to_redact.append(term)\n",
+ "\n",
+ "# Matching is case-insensitive, so drop entries that differ only by case.\n",
+ "words_to_redact = list({term.lower(): term for term in words_to_redact}.values())\n",
+ "\n",
+ "print(f\"Loaded {len(words_to_redact)} redaction term(s).\")\n",
+ "if not words_to_redact:\n",
+ "    print(f\"WARNING: no redaction terms found - create {WORD_LIST_PATH} or fill inline_terms.\")"
+ ],
+ "id": "nRSZKgpPz-rj"
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Optional: OCR with Tesseract (adds a searchable text layer, updates the input PDFs in place)\n",
+ "run_ocr = True  # Set to False to disable\n",
+ "\n",
+ "if run_ocr:\n",
+ "    # System dependencies: poppler-utils backs pdf2image, and tesseract-ocr is the engine\n",
+ "    # that pytesseract shells out to (the pip package is only a wrapper around the binary).\n",
+ "    # Installing a package the runtime already has is a no-op.\n",
+ "    !apt-get update -qq\n",
+ "    !apt-get install -y -qq poppler-utils tesseract-ocr\n",
+ "\n",
+ "    # Python packages, pinned to the versions this notebook was last run with.\n",
+ "    !pip install -q pytesseract==0.3.13 pdf2image==1.17.0 PyPDF2==3.0.1\n",
+ "\n",
+ "    import os\n",
+ "\n",
+ "    import pytesseract\n",
+ "    from pdf2image import convert_from_path\n",
+ "    from PIL import Image\n",
+ "    from PyPDF2 import PdfMerger"
+ ],
+ "metadata": {
+ "id": "CU6xD7cDJGW7",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "46abb749-98c1-4c15-dfca-a99d51ad722b"
+ },
+ "id": "CU6xD7cDJGW7",
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\r0% [Working]\r \rHit:1 http://archive.ubuntu.com/ubuntu jammy InRelease\n",
+ "\r0% [Waiting for headers] [Waiting for headers] [Connected to cloud.r-project.or\r \rGet:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]\n",
+ "\r \rGet:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]\n",
+ "Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]\n",
+ "Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 InRelease [1,581 B]\n",
+ "Get:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]\n",
+ "Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]\n",
+ "Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease\n",
+ "Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease\n",
+ "Get:10 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [4,932 kB]\n",
+ "Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease\n",
+ "Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,148 kB]\n",
+ "Get:13 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,267 kB]\n",
+ "Get:14 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,572 kB]\n",
+ "Get:15 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [5,139 kB]\n",
+ "Get:16 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,461 kB]\n",
+ "Get:17 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64 Packages [1,840 kB]\n",
+ "Get:18 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,762 kB]\n",
+ "Get:19 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,126 kB]\n",
+ "Fetched 33.6 MB in 7s (5,019 kB/s)\n",
+ "Reading package lists... Done\n",
+ "W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)\n",
+ "Reading package lists... Done\n",
+ "Building dependency tree... Done\n",
+ "Reading state information... Done\n",
+ "The following NEW packages will be installed:\n",
+ " poppler-utils\n",
+ "0 upgraded, 1 newly installed, 0 to remove and 36 not upgraded.\n",
+ "Need to get 186 kB of archives.\n",
+ "After this operation, 697 kB of additional disk space will be used.\n",
+ "Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]\n",
+ "Fetched 186 kB in 0s (1,180 kB/s)\n",
+ "Selecting previously unselected package poppler-utils.\n",
+ "(Reading database ... 126281 files and directories currently installed.)\n",
+ "Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...\n",
+ "Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...\n",
+ "Setting up poppler-utils (22.02.0-2ubuntu0.8) ...\n",
+ "Processing triggers for man-db (2.10.2-1) ...\n",
+ "Collecting pytesseract\n",
+ " Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)\n",
+ "Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.11/dist-packages (from pytesseract) (25.0)\n",
+ "Requirement already satisfied: Pillow>=8.0.0 in /usr/local/lib/python3.11/dist-packages (from pytesseract) (11.2.1)\n",
+ "Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)\n",
+ "Installing collected packages: pytesseract\n",
+ "Successfully installed pytesseract-0.3.13\n",
+ "Collecting pdf2image\n",
+ " Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)\n",
+ "Requirement already satisfied: pillow in /usr/local/lib/python3.11/dist-packages (from pdf2image) (11.2.1)\n",
+ "Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)\n",
+ "Installing collected packages: pdf2image\n",
+ "Successfully installed pdf2image-1.17.0\n",
+ "Collecting PyPDF2\n",
+ " Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)\n",
+ "Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n",
+ "\u001b[2K \u001b[90mโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\u001b[0m \u001b[32m232.6/232.6 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25hInstalling collected packages: PyPDF2\n",
+ "Successfully installed PyPDF2-3.0.1\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import io\n",
+ "import os\n",
+ "import shutil\n",
+ "import tempfile\n",
+ "\n",
+ "input_dir = '/content/drive/MyDrive/Input'\n",
+ "\n",
+ "\n",
+ "def ocr_pdf_hocr(input_path, output_path):\n",
+ "    \"\"\"Rebuild a PDF as a searchable PDF using Tesseract.\n",
+ "\n",
+ "    Each page is rasterised at 300 dpi, converted by Tesseract into a one-page PDF, and\n",
+ "    the pages are merged in order into ``output_path``.\n",
+ "\n",
+ "    Args:\n",
+ "        input_path: path of the PDF to read.\n",
+ "        output_path: path the merged PDF is written to.\n",
+ "\n",
+ "    Raises:\n",
+ "        ValueError: if no page could be rendered from ``input_path``.\n",
+ "    \"\"\"\n",
+ "    # Imported here so the cell does not depend on names a conditional cell may have skipped.\n",
+ "    import pytesseract\n",
+ "    from pdf2image import convert_from_path\n",
+ "    from PyPDF2 import PdfMerger\n",
+ "\n",
+ "    merger = PdfMerger()\n",
+ "    try:\n",
+ "        # Page images live in a throw-away directory: nothing accumulates in /content and\n",
+ "        # pages rendered from different PDFs can never be mixed up.\n",
+ "        with tempfile.TemporaryDirectory(prefix='ocr_images_') as temp_img_dir:\n",
+ "            images = convert_from_path(input_path, dpi=300, fmt='jpeg', output_folder=temp_img_dir)\n",
+ "            if not images:\n",
+ "                # Never hand back an empty document: the caller replaces the original with it.\n",
+ "                raise ValueError(f\"no pages could be rendered from {input_path}\")\n",
+ "\n",
+ "            for img in images:\n",
+ "                # extension='pdf' already yields the page PDF; no extra 'hocr' config is needed.\n",
+ "                page_pdf = pytesseract.image_to_pdf_or_hocr(img, extension='pdf', lang='eng')\n",
+ "                # Merge from memory instead of writing one temporary PDF per page.\n",
+ "                merger.append(io.BytesIO(page_pdf))\n",
+ "                img.close()\n",
+ "\n",
+ "            merger.write(output_path)\n",
+ "    finally:\n",
+ "        merger.close()\n",
+ "    print(f\"✅ HOCR overlay OCR complete: {output_path}\")\n",
+ "\n",
+ "\n",
+ "# Overwrite files in place - only when OCR was enabled via run_ocr\n",
+ "if run_ocr:\n",
+ "    for file in sorted(os.listdir(input_dir)):\n",
+ "        if not file.lower().endswith('.pdf'):\n",
+ "            continue\n",
+ "        src = os.path.join(input_dir, file)\n",
+ "        temp_out = os.path.join('/content', f'ocr_{file}')\n",
+ "        try:\n",
+ "            ocr_pdf_hocr(src, temp_out)\n",
+ "            shutil.move(temp_out, src)  # Replace original with the OCR'd version\n",
+ "        except Exception as e:\n",
+ "            print(f\"⚠️ HOCR OCR failed for {file}: {e}\")\n",
+ "            # Leave the original untouched and drop any partial output.\n",
+ "            if os.path.exists(temp_out):\n",
+ "                os.remove(temp_out)"
+ ],
+ "metadata": {
+ "id": "CkdAzS0AWV6q"
+ },
+ "id": "CkdAzS0AWV6q",
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "vSfDa6_vz-ro"
+ },
+ "outputs": [],
+ "source": [
+ "# Optional redaction step\n",
+ "redact_pdf = True  # Set to False to disable\n",
+ "\n",
+ "if redact_pdf:\n",
+ "    import os\n",
+ "    import string\n",
+ "\n",
+ "    import fitz  # PyMuPDF\n",
+ "\n",
+ "    input_dir = \"/content/drive/MyDrive/Input\"\n",
+ "    output_dir = \"/content/drive/MyDrive/Output\"\n",
+ "    os.makedirs(output_dir, exist_ok=True)\n",
+ "\n",
+ "    def _normalize_token(token):\n",
+ "        \"\"\"Lower-case a token and strip surrounding punctuation ('Smith,' -> 'smith').\"\"\"\n",
+ "        return token.strip(string.punctuation).lower()\n",
+ "\n",
+ "    # Named redact_pdf_file so that defining it does not overwrite the redact_pdf flag above.\n",
+ "    def redact_pdf_file(input_path, output_path, words):\n",
+ "        \"\"\"Redact every occurrence of the given words/phrases and save a copy.\n",
+ "\n",
+ "        Matching is case-insensitive, works on whole words only, and ignores punctuation\n",
+ "        around a word. An entry containing several words matches that exact run of\n",
+ "        consecutive words on the page.\n",
+ "\n",
+ "        Args:\n",
+ "            input_path: PDF to read.\n",
+ "            output_path: where the redacted copy is written.\n",
+ "            words: iterable of words or phrases to redact.\n",
+ "        \"\"\"\n",
+ "        phrases = set()\n",
+ "        for entry in words:\n",
+ "            parts = tuple(p for p in (_normalize_token(t) for t in entry.split()) if p)\n",
+ "            if parts:\n",
+ "                phrases.add(parts)\n",
+ "        longest = max((len(p) for p in phrases), default=0)\n",
+ "\n",
+ "        with fitz.open(input_path) as doc:\n",
+ "            for page in doc:\n",
+ "                # Each entry is (x0, y0, x1, y1, \"word\", block_no, line_no, word_no)\n",
+ "                page_words = page.get_text(\"words\")\n",
+ "                normalized = [_normalize_token(w[4]) for w in page_words]\n",
+ "\n",
+ "                hit_indexes = set()\n",
+ "                for start in range(len(page_words)):\n",
+ "                    for length in range(1, longest + 1):\n",
+ "                        stop = start + length\n",
+ "                        if stop > len(page_words):\n",
+ "                            break\n",
+ "                        if tuple(normalized[start:stop]) in phrases:\n",
+ "                            hit_indexes.update(range(start, stop))\n",
+ "\n",
+ "                for index in sorted(hit_indexes):\n",
+ "                    rect = fitz.Rect(page_words[index][:4])\n",
+ "                    page.add_redact_annot(rect, fill=(0, 0, 0))\n",
+ "                if hit_indexes:\n",
+ "                    # Drawing a black box alone leaves the text extractable underneath;\n",
+ "                    # applying the redaction annotations removes the covered content.\n",
+ "                    page.apply_redactions()\n",
+ "            doc.save(output_path, garbage=4, deflate=True)\n",
+ "\n",
+ "    # Process all PDFs\n",
+ "    for file in os.listdir(input_dir):\n",
+ "        if file.lower().endswith(\".pdf\"):\n",
+ "            input_path = os.path.join(input_dir, file)\n",
+ "            output_path = os.path.join(output_dir, f\"redacted_{file}\")\n",
+ "            print(f\"Redacting: {file}\")\n",
+ "            redact_pdf_file(input_path, output_path, words_to_redact)\n",
+ "            print(f\"Saved to: {output_path}\")\n",
+ "\n",
+ "    print(\"✅ Redaction complete.\")"
+ ],
+ "id": "vSfDa6_vz-ro"
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": ""
+ },
+ "colab": {
+ "provenance": [],
+ "include_colab_link": true
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/app/build.gradle b/app/build.gradle
index 3b288dd..d05a6a8 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -9,6 +9,9 @@ android {
versionCode 11
versionName "1.10.20201004"
testInstrumentationRunner 'androidx.test.runner.AndroidJUnitRunner'
+ ndk {
+ abiFilters 'armeabi-v7a', 'arm64-v8a' // Optional: keep app smaller
+ }
}
buildTypes {
release {
@@ -30,7 +33,7 @@ dependencies {
androidTestImplementation 'androidx.test.espresso:espresso-core:3.3.0'
implementation project(':scanlibrary')
-
+
implementation 'androidx.appcompat:appcompat:1.2.0'
implementation 'com.google.android.material:material:1.2.1'
implementation 'androidx.legacy:legacy-support-v4:1.0.0'
@@ -40,6 +43,7 @@ dependencies {
implementation 'androidx.lifecycle:lifecycle-extensions:2.2.0'
implementation 'androidx.lifecycle:lifecycle-viewmodel:2.2.0'
+    implementation 'com.rmtheis:tess-two:9.1.0' // Maven group is com.rmtheis; com.google.code.tesseract.android is only the Java package
implementation 'com.google.android.gms:play-services-vision:20.1.2'
implementation 'com.google.firebase:firebase-core:17.5.0'
implementation 'com.google.firebase:firebase-ml-vision:24.1.0'
@@ -48,4 +52,4 @@ dependencies {
}
apply plugin: 'com.google.gms.google-services'
-com.google.gms.googleservices.GoogleServicesPlugin.config.disableVersionCheck = true
\ No newline at end of file
+com.google.gms.googleservices.GoogleServicesPlugin.config.disableVersionCheck = true
diff --git a/app/src/assets b/app/src/assets
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/app/src/assets
@@ -0,0 +1 @@
+
diff --git a/app/src/eng.traineddata b/app/src/eng.traineddata
new file mode 100644
index 0000000..f4744c2
Binary files /dev/null and b/app/src/eng.traineddata differ
diff --git a/app/src/main/java/com/babanomania/pdfscanner/OCRActivity.java b/app/src/main/java/com/babanomania/pdfscanner/OCRActivity.java
index 8d1acce..9f4c0d5 100644
--- a/app/src/main/java/com/babanomania/pdfscanner/OCRActivity.java
+++ b/app/src/main/java/com/babanomania/pdfscanner/OCRActivity.java
@@ -1,154 +1,156 @@
package com.babanomania.pdfscanner;
-import android.content.Context;
-import android.content.Intent;
-import android.graphics.Bitmap;
-import android.graphics.pdf.PdfRenderer;
-import android.os.AsyncTask;
-import android.os.Environment;
-import android.os.ParcelFileDescriptor;
-import android.os.Bundle;
-import android.util.Log;
-import android.view.View;
-import android.widget.Button;
-import android.widget.EditText;
-import android.widget.ProgressBar;
-import android.widget.RelativeLayout;
-
-import androidx.appcompat.app.AppCompatActivity;
-
-import com.babanomania.pdfscanner.utils.OCRUtils;
-import com.babanomania.pdfscanner.utils.UIUtil;
-
-import java.io.File;
-import java.util.ArrayList;
+import android.content.Context; import android.content.DialogInterface; import android.content.Intent; import android.graphics.Bitmap; import android.graphics.pdf.PdfRenderer; import android.os.AsyncTask; import android.os.Bundle; import android.os.Environment; import android.os.ParcelFileDescriptor; import android.util.Log; import android.view.View; import android.widget.Button; import android.widget.EditText; import android.widget.ProgressBar; import android.widget.RelativeLayout;
-public class OCRActivity extends AppCompatActivity {
-
- public EditText ocrText;
- public Button shareButton;
- private ProgressBar progressBar;
- public static String FILE_PATH = "file_path";
-
- @Override
- protected void onCreate(Bundle savedInstanceState) {
- super.onCreate(savedInstanceState);
- setContentView(R.layout.activity_ocr);
-
- RelativeLayout relativeLayout = findViewById(R.id.rl);
- UIUtil.setLightNavigationBar( relativeLayout, this );
-
- this.ocrText = findViewById(R.id.ocrText);
- this.shareButton = findViewById(R.id.shareBtn);
- this.progressBar = findViewById(R.id.extractingProgress);
-
- this.ocrText.setText( getResources().getString(R.string.ocr_waiting_text) );
- setTitle( getResources().getString(R.string.ocr_title) );
-
- this.progressBar.setVisibility(View.VISIBLE);
- this.shareButton.setVisibility(View.GONE);
- this.shareButton.setOnClickListener(
- new View.OnClickListener() {
- @Override
- public void onClick(View v) {
-
- String textToShare = ocrText.getText().toString();
- Intent sharingIntent = new Intent(android.content.Intent.ACTION_SEND);
- sharingIntent.setType("text/plain");
- sharingIntent.putExtra(android.content.Intent.EXTRA_TEXT, textToShare);
- startActivity(Intent.createChooser(sharingIntent, "Share Using"));
-
- }
- }
- );
-
- Bundle bundle = getIntent().getExtras();
- final String filePath = bundle.getString(FILE_PATH);
- new OCRExtractTask( this, getApplicationContext(), filePath )
- .execute();
- }
-
- public void setText( String content ){
- this.ocrText.setText( content );
- }
-
- private class OCRExtractTask extends AsyncTask {
-
- private OCRActivity ocrActivity;
- private Context context;
- private String filePath;
-
- public OCRExtractTask( OCRActivity ocrActivity, Context context, String filePath ){
- this.ocrActivity = ocrActivity;
- this.context = context;
- this.filePath = filePath;
- }
+import androidx.appcompat.app.AlertDialog; import androidx.appcompat.app.AppCompatActivity;
- @Override
- protected String doInBackground(String... strings) {
- try {
+import com.babanomania.pdfscanner.utils.OCRUtils; import com.babanomania.pdfscanner.utils.PDFUtils; import com.babanomania.pdfscanner.utils.UIUtil;
- ArrayList bitmaps = new ArrayList<>();
- final String baseDirectory = context.getString(R.string.base_storage_path);
- final File sd = Environment.getExternalStorageDirectory();
+import java.io.File; import java.util.ArrayList; import java.util.List;
- File toOcr = new File(sd, baseDirectory + this.filePath);
+public class OCRActivity extends AppCompatActivity {
- PdfRenderer renderer = new PdfRenderer(ParcelFileDescriptor.open(toOcr, ParcelFileDescriptor.MODE_READ_ONLY));
+public EditText ocrText;
+public Button shareButton;
+private ProgressBar progressBar;
+public static String FILE_PATH = "file_path";
+
+/**
+ * Sets up the OCR screen: binds the text field, share button and progress bar,
+ * shows the waiting text, and then asks which redaction mode to use for the file
+ * named by the {@link #FILE_PATH} intent extra.
+ */
+@Override
+protected void onCreate(Bundle savedInstanceState) {
+    super.onCreate(savedInstanceState);
+    setContentView(R.layout.activity_ocr);
+
+    RelativeLayout relativeLayout = findViewById(R.id.rl);
+    UIUtil.setLightNavigationBar(relativeLayout, this);
+
+    this.ocrText = findViewById(R.id.ocrText);
+    this.shareButton = findViewById(R.id.shareBtn);
+    this.progressBar = findViewById(R.id.extractingProgress);
+
+    this.ocrText.setText(getResources().getString(R.string.ocr_waiting_text));
+    setTitle(getResources().getString(R.string.ocr_title));
+
+    this.progressBar.setVisibility(View.VISIBLE);
+    this.shareButton.setVisibility(View.GONE);
+    this.shareButton.setOnClickListener(v -> {
+        String textToShare = ocrText.getText().toString();
+        Intent sharingIntent = new Intent(Intent.ACTION_SEND);
+        sharingIntent.setType("text/plain");
+        sharingIntent.putExtra(Intent.EXTRA_TEXT, textToShare);
+        startActivity(Intent.createChooser(sharingIntent, "Share Using"));
+    });
+
+    // getStringExtra returns null instead of throwing when the intent carries no extras.
+    final String filePath = getIntent().getStringExtra(FILE_PATH);
+    if (filePath == null) {
+        Log.e("Clean Scan", "OCRActivity started without a " + FILE_PATH + " extra");
+        this.ocrText.setText(getResources().getString(R.string.ocr_failed_text));
+        this.progressBar.setVisibility(View.GONE);
+        return;
+    }
+    showRedactionChoiceDialog(filePath);
+}
- Bitmap bitmap;
- final int pageCount = renderer.getPageCount();
- for (int i = 0; i < pageCount; i++) {
- PdfRenderer.Page page = renderer.openPage(i);
+/**
+ * Shows a non-cancelable dialog offering the three redaction modes and starts the
+ * OCR task for {@code filePath} with the flags that match the chosen entry.
+ *
+ * @param filePath value forwarded unchanged to the OCR task
+ */
+private void showRedactionChoiceDialog(final String filePath) {
+    final CharSequence[] options = {"HIPAA Guidelines", "Custom Word List", "No Redaction"};
+
+    AlertDialog.Builder builder = new AlertDialog.Builder(this);
+    builder.setTitle("Select Redaction Mode");
+    builder.setItems(options, (dialog, which) -> {
+        // Index 0 = HIPAA list, 1 = custom list, 2 = no redaction.
+        final boolean useHIPAA = (which == 0);
+        final boolean applyRedaction = (which != 2);
- int width = context.getResources().getDisplayMetrics().densityDpi / 72 * page.getWidth();
- int height = context.getResources().getDisplayMetrics().densityDpi / 72 * page.getHeight();
- bitmap = Bitmap.createBitmap(width, height, Bitmap.Config.ARGB_8888);
+        new OCRExtractTask(OCRActivity.this, getApplicationContext(), filePath, applyRedaction, useHIPAA).execute();
+    });
+    builder.setCancelable(false);
+    builder.show();
+}
- page.render(bitmap, null, null, PdfRenderer.Page.RENDER_MODE_FOR_DISPLAY);
+/**
+ * Replaces the contents of the {@code ocrText} field.
+ *
+ * @param content text to show
+ */
+public void setText(String content) {
+    ocrText.setText(content);
+}
- bitmaps.add(bitmap);
+private class OCRExtractTask extends AsyncTask {
+ private OCRActivity ocrActivity;
+ private Context context;
+ private String filePath;
+ private boolean applyRedaction;
+ private boolean redactWithHIPAA;
+
+    /**
+     * Stores the collaborators and options the background work reads later.
+     *
+     * @param ocrActivity     activity whose text field receives the result
+     * @param context         context used for resources and the OCR helpers
+     * @param filePath        path appended to the app's base storage directory
+     * @param applyRedaction  whether recognised text is redacted at all
+     * @param redactWithHIPAA selects the HIPAA word list instead of the custom one
+     */
+    public OCRExtractTask(OCRActivity ocrActivity, Context context, String filePath, boolean applyRedaction, boolean redactWithHIPAA) {
+        // Options first, collaborators second; the assignments are independent.
+        this.applyRedaction = applyRedaction;
+        this.redactWithHIPAA = redactWithHIPAA;
+        this.filePath = filePath;
+        this.context = context;
+        this.ocrActivity = ocrActivity;
+    }
- // close the page
- page.close();
+    /**
+     * Renders each page of the PDF, runs OCR on it, optionally redacts the recognised
+     * text, passes the combined text to {@code PDFUtils.writeSearchablePDF} and then
+     * publishes the text (or the failure message) on the UI thread.
+     *
+     * @param strings unused
+     * @return always {@code null}; results are delivered through {@code runOnUiThread}
+     */
+    @Override
+    protected String doInBackground(String... strings) {
+        try {
+            final String baseDirectory = context.getString(R.string.base_storage_path);
+            final File sd = Environment.getExternalStorageDirectory();
+            File toOcr = new File(sd, baseDirectory + this.filePath);
+            final int densityDpi = context.getResources().getDisplayMetrics().densityDpi;
+
+            StringBuilder extractedText = new StringBuilder();
+            // try-with-resources closes the renderer even when rendering or OCR throws.
+            try (PdfRenderer renderer = new PdfRenderer(ParcelFileDescriptor.open(toOcr, ParcelFileDescriptor.MODE_READ_ONLY))) {
+                final int pageCount = renderer.getPageCount();
+                for (int i = 0; i < pageCount; i++) {
+                    Bitmap bitmap;
+                    try (PdfRenderer.Page page = renderer.openPage(i)) {
+                        // Page size is in points (1/72 inch). Multiply before dividing so the
+                        // scale factor is not truncated to a whole number first.
+                        int width = page.getWidth() * densityDpi / 72;
+                        int height = page.getHeight() * densityDpi / 72;
+                        bitmap = Bitmap.createBitmap(width, height, Bitmap.Config.ARGB_8888);
+                        page.render(bitmap, null, null, PdfRenderer.Page.RENDER_MODE_FOR_DISPLAY);
+                    }
+
+                    // OCR each page as soon as it is rendered, so only one full-page
+                    // bitmap is referenced at a time instead of the whole document.
+                    String pageText = OCRUtils.getTextFromBitmap(context, bitmap);
+                    if (applyRedaction) {
+                        pageText = applyRedactions(pageText, redactWithHIPAA);
+                    }
+                    extractedText.append(pageText).append("\n\n");
+                }
+            }
- // close the renderer
- renderer.close();
-
- StringBuffer extractedText = new StringBuffer();
- for (Bitmap eachPage : bitmaps ) {
- extractedText.append(
- OCRUtils.getTextFromBitmap(context, eachPage)
- );
-
- }
+
+            String finalText = extractedText.toString();
+            PDFUtils.writeSearchablePDF(toOcr.getName(), finalText, context);
+
+            runOnUiThread(() -> {
+                ocrActivity.setText(finalText);
+                shareButton.setVisibility(View.VISIBLE);
+                progressBar.setVisibility(View.GONE);
+            });
+
+        } catch (Exception e) {
+            Log.e("Clean Scan", "Unable to extract text", e);
+            runOnUiThread(() -> {
+                ocrActivity.setText(getResources().getString(R.string.ocr_failed_text));
+                shareButton.setVisibility(View.GONE);
+                progressBar.setVisibility(View.GONE);
+            });
+        }
+        return null;
+    }
- Log.d( "Clean Scan", "detected text : " + extractedText );
- this.ocrActivity.setText(extractedText.toString() );
- runOnUiThread(new Runnable() {
- @Override
- public void run() {
- shareButton.setVisibility(View.VISIBLE);
- progressBar.setVisibility(View.GONE);
- }
- });
-
- }catch (Exception e){
- Log.e( "Clean Scan", "Unable to extract text", e );
- this.ocrActivity.setText( getResources().getString(R.string.ocr_failed_text) );
- runOnUiThread(new Runnable() {
- @Override
- public void run() {
- shareButton.setVisibility(View.GONE);
- progressBar.setVisibility(View.GONE);
- }
- });
-
- } finally {
- return null;
- }
+    /**
+     * Replaces every whole-word, case-insensitive occurrence of each redaction term
+     * with {@code [REDACTED]}.
+     *
+     * Terms are matched literally, so characters such as '.' or '(' in a term are not
+     * treated as regex syntax. Null or blank terms are skipped.
+     *
+     * @param text     text to redact; returned unchanged when null
+     * @param useHIPAA true to use the HIPAA word list, false for the custom list
+     * @return the redacted text
+     */
+    private String applyRedactions(String text, boolean useHIPAA) {
+        // NOTE(review): both helpers are presumed to return List<String> - confirm in OCRUtils.
+        List<String> redactionList = useHIPAA ? OCRUtils.getHIPAAWordList(context) : OCRUtils.getCustomWordList(context);
+        if (text == null || redactionList == null) {
+            return text;
+        }
+        for (String term : redactionList) {
+            if (term == null || term.trim().isEmpty()) {
+                continue; // an empty term would match between any two non-word characters
+            }
+            // Quote the term so it is matched literally. Lookarounds replace \b because
+            // \b never matches after a term that ends in punctuation (e.g. an initial "X.").
+            String literal = java.util.regex.Pattern.quote(term.trim());
+            text = text.replaceAll("(?i)(?<!\\w)" + literal + "(?!\\w)", "[REDACTED]");
 }
+        return text;
 }
}
+
+}
+
+