dimagi · kaapstorm · May 13, 2026 · Apr 9, 2026 · Apr 10, 2026 · Apr 10, 2026
diff --git a/commcare_export/env.py b/commcare_export/env.py
@@ -16,7 +16,7 @@
 
 logger = logging.getLogger(__name__)
 
-JSONPATH_CACHE = {}
+JSONPATH_CACHE: dict[str, Any] = {}
 
 
 class CannotBind(Exception):

diff --git a/commcare_export/excel_query.py b/commcare_export/excel_query.py
@@ -696,7 +696,7 @@ def check_columns(parsed_sheets, columns):
         raise MissingColumnException(errors_by_sheet)
 
 
-blacklisted_tables = []
+blacklisted_tables: list[str] = []
 
 
 def blacklist(table_name):

diff --git a/commcare_export/writers.py b/commcare_export/writers.py
@@ -3,6 +3,7 @@
 import logging
 from tempfile import NamedTemporaryFile
 import zipfile
+import itertools
 from itertools import zip_longest
 from typing import Optional
 
@@ -16,6 +17,8 @@
 
 logger = logging.getLogger(__name__)
 MAX_COLUMN_SIZE = 2000
+# Number of leading rows that SqlTableWriter.write_table() writes one at
+# a time, checking the table schema for each, before it switches to
+# batched writes.
+SCHEMA_CHECK_ROWS = 10
+# Maximum number of rows per batch passed to _flush_batch() by
+# write_table().
+BATCH_SIZE = 1000
 
 
 def ensure_text(v, convert_none=False):
@@ -605,25 +608,111 @@ def upsert(self, table, row_dict):
             )
             self.connection.execute(update)
 
+    def _commit(self):
+        """Commit the current transaction on ``self.connection``.
+
+        The commit is issued as a literal ``COMMIT`` statement.
+        """
+        # Explicit commit works for all DB types. Replace with explicit
+        # transactions when upgrading to SQLAlchemy 2.0
+        commit_statement = sqlalchemy.text('COMMIT')
+        self.connection.execute(commit_statement)
+
+    def bulk_upsert(self, table, batch):
+        """Upsert every row of ``batch`` into ``table``.
+
+        On PostgreSQL and MySQL the whole batch is sent as a single
+        multi-row ``INSERT`` with an on-conflict / on-duplicate-key
+        update clause. Any other dialect falls back to calling
+        ``upsert()`` once per row.
+
+        :param table: the SQLAlchemy table to write to
+        :param batch: list of dicts mapping column name to value. Rows
+            may have different key sets; a missing key is treated the
+            same as a ``None`` value. An empty batch is a no-op.
+        """
+        if not batch:
+            return
+        # SQLAlchemy requires all dicts in `batch` to have the same keys
+        # for `insert(table).values(batch)`. We need to drop the columns
+        # whose values are always `None` to reproduce the behavior of
+        # `SqlTableWriter.insert()`. `batch_keys` are the columns where
+        # _any_ row has a value set.
+        batch_keys = set()
+        for row_dict in batch:
+            for key, value in row_dict.items():
+                if value is not None:
+                    batch_keys.add(key)
+        # Use .get() rather than indexing: a row can lack a key that
+        # another row in the batch has (for example a short row zipped
+        # against the headings). Indexing would raise KeyError, which
+        # _flush_batch() does not catch.
+        batch = [
+            {k: row_dict.get(k) for k in batch_keys} for row_dict in batch
+        ]
+        if self.is_postgres:
+            from sqlalchemy.dialects.postgresql import insert
+
+            stmt = insert(table).values(batch)
+            new_row = stmt.excluded
+        elif self.is_mysql:
+            from sqlalchemy.dialects.mysql import insert
+
+            stmt = insert(table).values(batch)
+            new_row = stmt.inserted
+        else:
+            # MSSQL and others: fall back to row-by-row
+            for row_dict in batch:
+                self.upsert(table, row_dict)
+            return
+
+        # Use COALESCE so that a None in the inserted row preserves the
+        # existing column value, matching the per-row upsert() which
+        # strips Nones before building the UPDATE.
+        # Only reference columns that already exist on the table. New
+        # columns in batch_keys would raise KeyError here; the INSERT
+        # itself will then fail and _flush_batch retries after fixing
+        # the schema.
+        update_cols = {
+            c.name: sqlalchemy.func.coalesce(new_row[c.name], c)
+            for c in table.columns
+            if c.name != 'id' and c.name in batch_keys
+        }
+        if self.is_postgres:
+            stmt = stmt.on_conflict_do_update(
+                index_elements=['id'],
+                set_=update_cols,
+            )
+        else:
+            stmt = stmt.on_duplicate_key_update(**update_cols)
+        self.connection.execute(stmt)
+
+    def _flush_batch(self, table, batch, data_type_dict):
+        """Bulk-upsert ``batch`` into ``table`` and commit.
+
+        If the first bulk upsert raises one of the SQLAlchemy errors
+        listed below, every row of the batch is passed through
+        ``make_table_compatible()`` and the bulk upsert is attempted a
+        second time. An error from the second attempt propagates.
+        """
+        # Errors treated as a likely schema mismatch
+        schema_errors = (
+            sqlalchemy.exc.CompileError,
+            sqlalchemy.exc.OperationalError,
+            sqlalchemy.exc.ProgrammingError,
+            sqlalchemy.exc.DataError,
+        )
+        try:
+            self.bulk_upsert(table, batch)
+        except schema_errors:
+            # Fix the schema and retry once. The repaired table is only
+            # used for this retry; it is not handed back to the caller.
+            fixed_table = table
+            for row_dict in batch:
+                fixed_table = self.make_table_compatible(
+                    fixed_table,
+                    row_dict,
+                    data_type_dict,
+                )
+            self.bulk_upsert(fixed_table, batch)
+        self._commit()
+
     def write_table(self, table_spec: TableSpec) -> None:
+        """Write the rows of ``table_spec`` to its SQL table.
+
+        The table is created from the first row if it does not exist
+        yet. The first ``SCHEMA_CHECK_ROWS`` rows are checked against
+        the table schema and upserted one at a time; the remaining
+        rows are upserted in batches of up to ``BATCH_SIZE``. Nothing
+        is written if ``table_spec`` has no rows.
+        """
         table_name = table_spec.name
         headings = table_spec.headings
         data_type_dict = dict(zip_longest(headings, table_spec.data_types))
-        for i, row in enumerate(table_spec.rows):
-            row_dict = dict(zip(headings, row))
-            if i == 0:
-                table = self.get_table(table_name)
-                if table is None:
-                    table = self.create_table(
-                        table_name,
-                        row_dict,
-                        data_type_dict,
-                    )
-            # Checks the data type for every cell in every row. Maybe we
-            # can use a future version of the data dictionary to avoid
-            # this?
+
+        rows = (dict(zip(headings, row)) for row in table_spec.rows)
+        first_row = next(rows, None)
+        if first_row is None:
+            return
+        # Put the first row back at the head of the stream so that it
+        # is schema-checked and upserted like every other row, whether
+        # or not the table already existed.
+        row_stream = itertools.chain([first_row], rows)
+
+        table = self.get_table(table_name)
+        if table is None:
+            table = self.create_table(table_name, first_row, data_type_dict)
+
+        for row_dict in itertools.islice(row_stream, SCHEMA_CHECK_ROWS):
             table = self.make_table_compatible(table, row_dict, data_type_dict)
             self.upsert(table, row_dict)
+        self._commit()
+
+        logger.debug(
+            "Schema check complete for %s rows in table '%s'. "
+            'Final columns: %s',
+            SCHEMA_CHECK_ROWS,
+            table_name,
+            [c.name for c in table.columns],
+        )
+
+        # The remaining rows are written in batches.
+        for batch in _batched(row_stream, BATCH_SIZE):
+            self._flush_batch(table, batch, data_type_dict)
 
     def _get_columns_for_data(self, row_dict, data_type_dict):
         return [self.get_id_column()] + [
@@ -638,3 +727,9 @@ def _get_columns_for_data(self, row_dict, data_type_dict):
                 and column_name != 'id'
             )
         ]
+
+
+# Use itertools.batched when Python is always >= 3.12
+def _batched(iterable, n):
+    """Yield successive lists of up to ``n`` items from ``iterable``.
+
+    The final list may be shorter than ``n``. Nothing is yielded for an
+    empty iterable. Unlike ``itertools.batched``, batches are lists,
+    not tuples.
+    """
+    # Take a single iterator up front. Slicing a re-iterable such as a
+    # list directly would restart from its first item on every pass and
+    # never terminate.
+    iterator = iter(iterable)
+    while batch := list(itertools.islice(iterator, n)):
+        yield batch