Skip to content

Commit 783b746

Browse files
committed
cove: add check for large number of XLSX sheet rows
This commit fixes an issue where XLSX spreadsheets with a malformed number of rows generate very large unflattened JSON files. The fix simply checks for an excessive number of rows before passing off to cove/flattentool. The number of rows considered excessive is controllable through the MAX_XLSX_ROWS environment variable and set by default to 50000.
1 parent 5e2cc9c commit 783b746

5 files changed

Lines changed: 49 additions & 24 deletions

File tree

302 KB
Binary file not shown.
Binary file not shown.

cove/cove_360/tests/test_browser.py

Lines changed: 10 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -150,31 +150,17 @@ def server_url(request, live_server):
150150
'bad currency',
151151
], True),
152152
('badfile_all_validation_errors_4_times.xlsx', [
153-
'Description is missing but required (more info about this error)',
154-
'id is missing but required within recipientOrganization (more info about this error)',
155-
'Date is not in the correct format (more info about this error)',
156-
'Amount Awarded is not a number. Check that the value is not null, and doesn’t contain any characters other than 0-9 and dot (.). Number values should not be in quotes.',
157-
'Invalid \'uri\' found (more info about this error)',
158-
'Invalid code found in Currency (more info about this error)',
159-
'[] is too short. You must supply at least one value, or remove the item entirely (unless it’s required).',
160-
# Context dates should be ISO formatted
161-
'2019-06-01T00:00:00+00:00',
162-
'bad date 1',
163-
'bad date 2',
164-
'bad date 3',
165-
'bad date 4',
166-
'This should be a number',
167-
'This should be a uri 1',
168-
'This should be a uri 2',
169-
# 'This should be a uri 3',
170-
'This should be a uri 5',
171-
'This should be a uri 6',
172-
# 'This should be a uri 7',
173-
'bad currency 1',
174-
'bad currency 2',
175-
'bad currency 3',
176-
'bad currency 4',
153+
'',
177154
], True),
155+
('badfile_too_many_rows.xlsx', [
156+
'This XLSX workbook has a worksheet (grants) with 50001 rows '
157+
'but the maximum number of rows supported by this tool is 50000'
158+
], False),
159+
('badfile_too_many_rows_multiple_sheets.xlsx', [
160+
'This XLSX workbook has worksheets with a larger number of rows '
161+
'than is supported by this tool (50000). Worksheets with too many rows: '
162+
'\'grants\' (50001 rows), \'grants_2\' (50002 rows)'
163+
], False),
178164
("dei_extension.xlsx", [
179165
"do not use the 360Giving Data Standard codelists correctly.",
180166
], True),

cove/cove_360/views.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import logging
66
import re
77
import os
8+
import zipfile
89
from decimal import Decimal
910

1011
from cove.views import explore_data_context, cove_web_input_error
@@ -17,6 +18,8 @@
1718
from django.utils.translation import gettext_lazy as _
1819
from django.core.cache import cache
1920

21+
import openpyxl
22+
2023
from libcove.config import LibCoveConfig
2124
from libcove.lib.converters import convert_spreadsheet, convert_json
2225
from libcove.lib.exceptions import CoveInputDataError
@@ -145,6 +148,40 @@ def explore_360(request, pk, template='cove_360/explore.html'):
145148
lib_cove_config=lib_cove_config))
146149

147150
else:
151+
if file_type == "xlsx":
152+
153+
# Check for an excessive number of rows before passing to flattentool.
154+
excessive_sheets = {}
155+
try:
156+
workbook = openpyxl.reader.excel.load_workbook(file_name, read_only=True)
157+
excessive_sheets = {
158+
sheetname: workbook[sheetname].max_row
159+
for sheetname in workbook.sheetnames
160+
if workbook[sheetname].max_row > settings.MAX_XLSX_ROWS
161+
}
162+
163+
except (zipfile.BadZipFile, openpyxl.utils.exceptions.InvalidFileException):
164+
# Exceptions associated with invalid spreadsheets are passed through for cove to handle.
165+
pass
166+
167+
if len(excessive_sheets) == 1:
168+
sheetname = next(iter(excessive_sheets))
169+
wrapped_err = f"This XLSX workbook has a worksheet ({sheetname}) with " \
170+
f"{excessive_sheets[sheetname]} rows " \
171+
f"but the maximum number of rows supported by this tool is {settings.MAX_XLSX_ROWS}"
172+
raise CoveInputDataError(wrapped_err=wrapped_err)
173+
elif len(excessive_sheets) > 1:
174+
wrapped_err = "This XLSX workbook has worksheets with a larger number of rows " \
175+
f"than is supported by this tool ({settings.MAX_XLSX_ROWS}). " \
176+
"Worksheets with too many rows: "
177+
wrapped_err += ", ".join(
178+
[
179+
f"'{sheetname}' ({num_rows} rows)"
180+
for sheetname, num_rows in excessive_sheets.items()
181+
]
182+
)
183+
raise CoveInputDataError(wrapped_err=wrapped_err)
184+
148185
# Convert spreadsheet to json
149186
context.update(convert_spreadsheet(upload_dir, upload_url, file_name, file_type, lib_cove_config, schema_360.schema_file,
150187
schema_360.pkg_schema_file))

cove/cove_project/settings.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
MEDIA_URL=(str, "/media/"),
1313
STATIC_ROOT=(str, os.path.join(BASE_DIR, "static")),
1414
STATIC_URL=(str, "/static/"),
15+
MAX_XLSX_ROWS=(int, 50000),
1516
)
1617

1718
# We use the setting to choose whether to show the section about Sentry in the
@@ -35,6 +36,7 @@
3536
SECRET_KEY = settings.SECRET_KEY
3637
DEBUG = settings.DEBUG
3738
ALLOWED_HOSTS = settings.ALLOWED_HOSTS
39+
MAX_XLSX_ROWS = env("MAX_XLSX_ROWS")
3840

3941
MIDDLEWARE = (
4042
'django.contrib.sessions.middleware.SessionMiddleware',

0 commit comments

Comments
 (0)