diff --git a/MANIFEST.in b/MANIFEST.in
index 9716f34..d0ca67a 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,7 +1,7 @@
 include geocode/ons/constituency_centroids.psv
 include geocode/ons/constituency_centroids_Dec2020.psv
 include geocode/ons/lad_centroids_May2021.psv
-include geocode/ons/nrs_2011.zip
-include geocode/ons/nrs_2021.zip
-include geocode/ons/PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.zip
+include geocode/ons/nrs_2011.7z
+include geocode/ons/nrs_2021.7z
+include geocode/ons/PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.7z
 include geocode/code_point_open/codepo_gb.zip
diff --git a/geocode/ons/PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.7z b/geocode/ons/PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.7z
new file mode 100644
index 0000000..efa3ea2
Binary files /dev/null and b/geocode/ons/PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.7z differ
diff --git a/geocode/ons/nrs_2021.zip b/geocode/ons/nrs_2011.7z
similarity index 76%
rename from geocode/ons/nrs_2021.zip
rename to geocode/ons/nrs_2011.7z
index a7c9a70..310f79d 100644
Binary files a/geocode/ons/nrs_2021.zip and b/geocode/ons/nrs_2011.7z differ
diff --git a/geocode/ons/nrs_2011.zip b/geocode/ons/nrs_2011.zip
deleted file mode 100644
index 4a3a24d..0000000
Binary files a/geocode/ons/nrs_2011.zip and /dev/null differ
diff --git a/geocode/ons/PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.zip b/geocode/ons/nrs_2021.7z
similarity index 67%
rename from geocode/ons/PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.zip
rename to geocode/ons/nrs_2021.7z
index bc29b3d..6f42ce9 100644
Binary files a/geocode/ons/PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.zip and b/geocode/ons/nrs_2021.7z differ
diff --git a/geocode/ons_nrs.py b/geocode/ons_nrs.py
index d487380..ab17f57 100644
--- a/geocode/ons_nrs.py
+++ b/geocode/ons_nrs.py
@@ -7,21 +7,16 @@
 """
 
 import os
-import sys
-import zipfile
-import json
-import csv
+import tempfile
 import logging
 from pathlib import Path
-from typing import Literal, Optional, Iterable, Tuple, Union, List, Dict
+from typing import Literal, Iterable, Tuple, Union, List, Dict
 
 import pandas as pd
 import geopandas as gpd
-import shapefile
 
 try:
-
from shapely.geometry import shape, Point - from shapely.ops import unary_union + from shapely.geometry import shape except ImportError: logging.warning( "Failed to import Shapely library - you will not be able to reverse-geocode! " @@ -45,13 +40,12 @@ def __init__(self, cache_manager, proxies=None, ssl_verify=True): """ self.cache_manager = cache_manager self.data_dir = SCRIPT_DIR.joinpath("ons") - self.nrs_zipfile = self.data_dir.joinpath("nrs_2011.zip") self.constituency_lookup_file = self.data_dir.joinpath( "constituency_centroids_Dec2020.psv" ) self.lad_lookup_file = self.data_dir.joinpath("lad_centroids_May2021.psv") - self.pc_llsoa_zipfile = self.data_dir.joinpath( - "PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.zip" + self.pc_llsoa_sevenzipfile = self.data_dir.joinpath( + "PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.7z" ) self.llsoa_lookup = None self.llsoa_regions = None @@ -117,18 +111,20 @@ def _load_llsoa_lookup(self): ] engwales_lookup.reset_index(names="code", inplace=True) - zip_path_2011 = self.data_dir.joinpath("nrs_2011.zip") - zip_path_2021 = self.data_dir.joinpath("nrs_2021.zip") + sevenzip_path_2011 = self.data_dir.joinpath("nrs_2011.7z") + sevenzip_path_2021 = self.data_dir.joinpath("nrs_2021.7z") - OA_2011_centroids = gpd.read_file( - f"zip://{zip_path_2011}!OutputArea2011_PWC_WGS84.csv", - columns=["code", "easting", "northing"], + OA_2011_centroids = utils.read_csv_from_7z( + sevenzip_path_2011, + "OutputArea2011_PWC_WGS84.csv", + usecols=["code", "easting", "northing"], ) OA_2011_centroids = utils.add_latlon(OA_2011_centroids, "easting", "northing") scots_lookup_2011 = OA_2011_centroids[["code", "latitude", "longitude"]] - OA_2021_centroids = gpd.read_file( - f"zip://{zip_path_2021}!OutputArea2022_PWC_WGS84.csv", - columns=["code", "easting", "northing"], + OA_2021_centroids = utils.read_csv_from_7z( + sevenzip_path_2021, + "OutputArea2022_PWC_WGS84.csv", + usecols=["code", "easting", "northing"], ) OA_2021_centroids = utils.add_latlon(OA_2021_centroids, 
"easting", "northing") scots_lookup_2021 = OA_2021_centroids[["code", "latitude", "longitude"]] @@ -136,17 +132,19 @@ def _load_llsoa_lookup(self): drop=True ) - DZ_2011_centroids = gpd.read_file( - f"zip://{zip_path_2011}!SG_DataZone_Cent_2011.csv", - columns=["DataZone", "Easting", "Northing"], + DZ_2011_centroids = utils.read_csv_from_7z( + sevenzip_path_2011, + "SG_DataZone_Cent_2011.csv", + usecols=["DataZone", "Easting", "Northing"], ) DZ_2011_centroids = utils.add_latlon(DZ_2011_centroids, "Easting", "Northing") scots_dz_lookup_2011 = DZ_2011_centroids[ ["DataZone", "latitude", "longitude"] ].rename(columns={"DataZone": "code"}) - DZ_2021_centroids = gpd.read_file( - f"zip://{zip_path_2021}!SG_DataZone_Cent_2022.csv", - columns=["DataZone", "Easting", "Northing"], + DZ_2021_centroids = utils.read_csv_from_7z( + sevenzip_path_2021, + "SG_DataZone_Cent_2022.csv", + usecols=["DataZone", "Easting", "Northing"], ) DZ_2021_centroids = utils.add_latlon(DZ_2021_centroids, "Easting", "Northing") scots_dz_lookup_2021 = DZ_2021_centroids[ @@ -204,19 +202,24 @@ def _load_llsoa_boundaries_engwales_regions(self, version: Literal["2011", "2021 def _load_llsoa_boundaries_scots_regions(self, version: Literal["2011", "2021"]): """ - Load the LLSOA boundaries for Scotland from the NRS zipfile. + Load the LLSOA boundaries for Scotland from the NRS 7z file. Parameters ---------- `version` : Literal["2011", "2021"] The version of the LLSOA boundaries to load. 
""" - zip_path = self.data_dir.joinpath(f"nrs_{version}.zip") + sevenzip_path = self.data_dir.joinpath(f"nrs_{version}.7z") llsoa_filename = { - "2011": "OutputArea2011_EoR_WGS84.shp", - "2021": "OutputArea2022_EoR.shp", + "2011": "OutputArea2011_EoR_WGS84.geojson", + "2021": "OutputArea2022_EoR.geojson", } - gdf = gpd.read_file(f"zip://{zip_path}!{llsoa_filename[version]}") + target_file = llsoa_filename[version] + + with tempfile.TemporaryDirectory() as tmpdir: + utils.extract_from_7z(sevenzip_path, target_file, tmpdir) + extracted_file = Path(tmpdir) / target_file + gdf = gpd.read_file(extracted_file) if version == "2021": gdf.set_crs("EPSG:27700", inplace=True) gdf.to_crs("EPSG:4326", inplace=True) @@ -273,11 +276,9 @@ def _load_datazone_lookup(self, version: Literal["2011", "2021"]): f"Loading {version} LLSOA<->Datazone lookup from cache {cache_label}" ) return datazone_lookup_cache_contents - zip_path = self.data_dir.joinpath(f"nrs_{version}.zip") + sevenzip_path = self.data_dir.joinpath(f"nrs_{version}.7z") dz_lookup_filename = {"2011": "OA_DZ_IZ_2011.csv", "2021": "OA22_DZ22_IZ22.csv"} - with zipfile.ZipFile(zip_path, "r") as nrs_zip: - with nrs_zip.open(dz_lookup_filename[version], "r") as fid: - dz_lookup = pd.read_csv(fid) + dz_lookup = utils.read_csv_from_7z(sevenzip_path, dz_lookup_filename[version]) if version == "2011": dz_lookup.set_index("OutputArea2011Code", inplace=True) dz_lookup.drop(columns=["IntermediateZone2011Code"], inplace=True) @@ -522,7 +523,11 @@ def _load_postcode_llsoa_lookup(self): "Loading postcode<->LLSOA lookup from cache ('%s')", "pc_llsoa_lookup" ) return postcode_llsoa_lookup_cache_contents - pc_llsoa_lookup = pd.read_csv(self.pc_llsoa_zipfile, dtype=str) + pc_llsoa_lookup = utils.read_csv_from_7z( + self.pc_llsoa_sevenzipfile, + "PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.csv", + dtype=str, + ) pc_llsoa_lookup["postcode"] = ( pc_llsoa_lookup.pcds.str.strip().str.upper().str.replace(" ", "") ) diff --git a/geocode/utilities.py 
b/geocode/utilities.py
index 34b88c5..ce69746 100644
--- a/geocode/utilities.py
+++ b/geocode/utilities.py
@@ -6,15 +6,18 @@
 - First Authored: 2022-10-19
 """
 
+import io
 import sys
 import logging
 import requests
 import json
+from pathlib import Path
 from typing import Optional, Iterable, Tuple, Union, List, Dict
 
 import geopandas as gpd
 import pandas as pd
 import pyproj
+import py7zr
 
 
 class GenericException(Exception):
@@ -423,3 +426,98 @@ def add_latlon(
     df[lat_col] = df.geometry.y
     df[lon_col] = df.geometry.x
     return df
+
+
+def extract_from_7z(archive_path: Path, file_to_extract: str, tmp_dir: str):
+    """
+    Extract a single file from a .7z archive into a directory.
+
+    Parameters
+    ----------
+    `archive_path` : Path
+        Path to the .7z archive.
+    `file_to_extract` : str
+        The filename to extract (e.g. "OutputArea2011_EoR_WGS84.geojson").
+    `tmp_dir` : str
+        Path to the directory to extract into.
+
+    Returns
+    -------
+    `Path`
+        Location of the extracted file (`tmp_dir` joined with `file_to_extract`).
+
+    Raises
+    ------
+    `FileNotFoundError`
+        If `file_to_extract` is not a member of the archive.
+    """
+    with py7zr.SevenZipFile(str(archive_path), mode="r") as archive:
+        # Fail here with a clear message if the member is missing or misspelt,
+        # rather than later when the caller tries to open the extracted file.
+        if file_to_extract not in archive.getnames():
+            raise FileNotFoundError(
+                f"'{file_to_extract}' not found in archive '{archive_path}'"
+            )
+        archive.extract(path=tmp_dir, targets=[file_to_extract])
+    return Path(tmp_dir) / file_to_extract
+
+
+def read_csv_from_7z(archive_path: Path, file_to_read: str, **kwargs):
+    """
+    Read a CSV file from a .7z archive into a DataFrame, entirely in memory.
+
+    Parameters
+    ----------
+    `archive_path` : Path
+        Path to the .7z archive.
+    `file_to_read` : str
+        The CSV filename to extract.
+    `**kwargs`
+        Additional keyword arguments passed to `pd.read_csv`.
+
+    Returns
+    -------
+    `pd.DataFrame`
+        The extracted CSV file as a DataFrame.
+
+    Raises
+    ------
+    `FileNotFoundError`
+        If `file_to_read` is not a member of the archive.
+    """
+
+    class InMemoryBuffer(io.BytesIO):
+        """
+        BytesIO with the `size()` method that py7zr's Py7zIO interface declares.
+        """
+
+        def size(self):
+            return self.getbuffer().nbytes
+
+    class InMemoryFactory:
+        """
+        A factory for py7zr to extract files into memory rather than to disk.
+        """
+
+        def __init__(self):
+            self.files = {}
+
+        def create(self, filename):
+            buffer = InMemoryBuffer()
+            self.files[filename] = buffer
+            return buffer
+
+    factory = InMemoryFactory()
+
+    with py7zr.SevenZipFile(str(archive_path), mode="r") as archive:
+        # Check membership up front so a missing member produces a clear error.
+        if file_to_read not in archive.getnames():
+            raise FileNotFoundError(
+                f"'{file_to_read}' not found in archive '{archive_path}'"
+            )
+        archive.extract(targets=[file_to_read], factory=factory)
+
+    # Rewind so pandas reads from the start of the extracted bytes.
+    csv_buffer = factory.files[file_to_read]
+    csv_buffer.seek(0)
+    return pd.read_csv(csv_buffer, **kwargs)
diff --git a/requirements.txt b/requirements.txt
index f7d5709..ab47916 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,4 +9,5 @@ fiona
 rtree
 shapely>=1.7.0
 pyshp
-geopandas
\ No newline at end of file
+geopandas
+py7zr>=1.0.0
\ No newline at end of file