From a3fd7152c7e98918b0e2bd3344693737559cf7dc Mon Sep 17 00:00:00 2001 From: Frederick Hoyles Date: Mon, 8 Jun 2026 13:15:27 +0200 Subject: [PATCH] Add get_table_data_iter and get_table_data_all_iter The new methods are iterators that yield rows from the response chunks returned by .fetch without collecting the full result into a list, thus saving memory for large responses. Changes get_table_data and get_table_data_all to be thin wrappers around their corresponding _iter methods that only collect the full result into a list. --- src/pxweb/api.py | 88 ++++++++++++++++++++++++++++++++++++++++++----- tests/test_api.py | 22 ++++++++++++ 2 files changed, 101 insertions(+), 9 deletions(-) diff --git a/src/pxweb/api.py b/src/pxweb/api.py index cd964fc..c744efd 100644 --- a/src/pxweb/api.py +++ b/src/pxweb/api.py @@ -1,3 +1,4 @@ +from collections.abc import Iterator from concurrent.futures import ThreadPoolExecutor from logging import getLogger from typing import Literal, TypeAlias @@ -401,6 +402,41 @@ def get_table_data( │ 0192 Nynäshamn ┆ uppgift saknas ┆ Folkmängd ┆ 2024 ┆ 0 │ └─────────────────────┴────────────────┴────────────────┴──────┴───────┘ """ + return list( + self.get_table_data_iter(table_id, value_codes, code_list, show) + ) + + def get_table_data_iter( + self, + table_id: str, + value_codes: dict[str, list[str] | str] | None = None, + code_list: dict[str, str] | None = None, + show: Literal["code", "value", "code_value"] | None = None, + ) -> Iterator[dict]: + """ + Like `~~.PxApi.get_table_data`, but yields row dicts one at a time + instead of materialising the full dataset in memory before returning. + When a query is split into subqueries, rows are yielded as each + subquery completes, so processing can begin before all network calls + have finished. This is useful for streaming large tables to disk + without holding the whole result in RAM. + + Parameters + ---------- + table_id: str + An ID of a table to get data from. 
+ value_codes: dict, optional + The value codes to use for data selection where the keys are the variable codes. You can use the `~~.PxApi.get_table_variables()` to explore what's available. + code_list: dict, optional + Any named code list to use with a variable for code selection. + show: str, optional + Set to "code_value", "code" or "value", to specify what to show in the categorical columns. + + Yields + ------ + : + One dict per data cell, in the same format as `~~.PxApi.get_table_data`. + """ # TODO support output_values if show not in (valid_show := {"code", "value", "code_value", None}): @@ -413,8 +449,8 @@ def get_table_data( response = self._client.call( endpoint=f"/tables/{table_id}/data", ) - dataset = unpack_table_data(response, show=show) - return dataset + yield from unpack_table_data(response, show=show) + return # A shallow copy to avoid unexpected mutation, e.g. turning a single item into a list value_codes = dict(value_codes) @@ -492,7 +528,6 @@ def fetch(query): value_codes, self._client.configuration["maxDataCells"] ) ] - dataset = [] if self.max_workers == 1: # 1 worker = sequential on main thread logger.debug( @@ -500,7 +535,7 @@ def fetch(query): len(subqueries), ) for subquery in subqueries: - dataset.extend(fetch(subquery)) + yield from fetch(subquery) else: logger.debug( "Fetching %s subqueries with %s workers", @@ -513,13 +548,11 @@ def fetch(query): ) as executor: # Map() so that we yield results in order for result in executor.map(fetch, subqueries): - dataset.extend(result) + yield from result else: # No batching needed so we just go ahead with the query as is query = build_query(value_codes, code_list) - dataset = fetch(query) - - return dataset + yield from fetch(query) def get_table_data_all( self, @@ -542,10 +575,47 @@ def get_table_data_all( : A dataset in a native format that can be loaded into a dataframe. 
""" + return list(self.get_table_data_all_iter(table_id, show=show)) + + def get_table_data_all_iter( + self, + table_id: str, + show: Literal["code", "value", "code_value"] | None = None, + ) -> Iterator[dict]: + """ + Like `~~.PxApi.get_table_data_all`, but yields row dicts one at a time + instead of materialising the full dataset in memory before returning. + Rows are yielded as each subquery completes, so processing can begin + before all network calls have finished. This makes it possible to + stream very large tables to disk without holding the whole result in + RAM. + + Parameters + ---------- + table_id: str + An ID of a table to get data from. + show: str, optional + Set to "code_value", "code" or "value", to specify what to show in the categorical columns. + + Yields + ------ + : + One dict per data cell, in the same format as `~~.PxApi.get_table_data_all`. + + Examples + -------- + Stream a large table to a newline-delimited JSON file without loading + the whole dataset into memory. + + >>> import json + >>> with open("TAB6683.ndjson", "w") as f: + ... for record in api.get_table_data_all_iter("TAB6683"): + ... 
f.write(json.dumps(record, ensure_ascii=False) + "\\n") + """ selection_all: dict[str, list[str] | str] = { k: ["*"] for k in self.get_table_variables(table_id) } - return self.get_table_data( + yield from self.get_table_data_iter( table_id, value_codes=selection_all, show=show ) diff --git a/tests/test_api.py b/tests/test_api.py index 9172ca5..190a262 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -38,6 +38,28 @@ def test_get_table_data(api): assert len(dataset) > 1 +def test_get_table_data_iter(api): + iterator = api.get_table_data_iter(table_id="TAB6471") + + assert not isinstance(iterator, list) + + rows = list(iterator) + assert all(isinstance(row, dict) for row in rows) + # The iterator should yield the same data as the list-returning method + assert rows == api.get_table_data(table_id="TAB6471") + + +def test_get_table_data_all_iter(api): + iterator = api.get_table_data_all_iter(table_id="TAB6471") + + assert not isinstance(iterator, list) + + rows = list(iterator) + assert len(rows) > 1 + assert all(isinstance(row, dict) and "value" in row for row in rows) + assert rows == api.get_table_data_all(table_id="TAB6471") + + def test_get_table_data_only_list_or_strings(api): with pytest.raises(ValueError): api.get_table_data(table_id="TAB6471", value_codes={"some_var": 42}) # type: ignore