Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
include geocode/ons/constituency_centroids.psv
include geocode/ons/constituency_centroids_Dec2020.psv
include geocode/ons/lad_centroids_May2021.psv
include geocode/ons/nrs_2011.zip
include geocode/ons/nrs_2021.zip
include geocode/ons/nrs_2011.7z
include geocode/ons/nrs_2021.7z
include geocode/ons/PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.zip
include geocode/code_point_open/codepo_gb.zip
Binary file added geocode/ons/PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.7z
Binary file not shown.
Binary file renamed geocode/ons/nrs_2021.zip → geocode/ons/nrs_2011.7z
Binary file not shown.
Binary file removed geocode/ons/nrs_2011.zip
Binary file not shown.
Binary file not shown.
75 changes: 40 additions & 35 deletions geocode/ons_nrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,16 @@
"""

import os
import sys
import zipfile
import json
import csv
import tempfile
import logging
from pathlib import Path
from typing import Literal, Optional, Iterable, Tuple, Union, List, Dict
from typing import Literal, Iterable, Tuple, Union, List, Dict

import pandas as pd
import geopandas as gpd
import shapefile

try:
from shapely.geometry import shape, Point
from shapely.ops import unary_union
from shapely.geometry import shape
except ImportError:
logging.warning(
"Failed to import Shapely library - you will not be able to reverse-geocode! "
Expand All @@ -45,13 +40,12 @@ def __init__(self, cache_manager, proxies=None, ssl_verify=True):
"""
self.cache_manager = cache_manager
self.data_dir = SCRIPT_DIR.joinpath("ons")
self.nrs_zipfile = self.data_dir.joinpath("nrs_2011.zip")
self.constituency_lookup_file = self.data_dir.joinpath(
"constituency_centroids_Dec2020.psv"
)
self.lad_lookup_file = self.data_dir.joinpath("lad_centroids_May2021.psv")
self.pc_llsoa_zipfile = self.data_dir.joinpath(
"PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.zip"
self.pc_llsoa_sevenzipfile = self.data_dir.joinpath(
"PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.7z"
)
self.llsoa_lookup = None
self.llsoa_regions = None
Expand Down Expand Up @@ -117,36 +111,40 @@ def _load_llsoa_lookup(self):
]
engwales_lookup.reset_index(names="code", inplace=True)

zip_path_2011 = self.data_dir.joinpath("nrs_2011.zip")
zip_path_2021 = self.data_dir.joinpath("nrs_2021.zip")
sevenzip_path_2011 = self.data_dir.joinpath("nrs_2011.7z")
sevenzip_path_2021 = self.data_dir.joinpath("nrs_2021.7z")

OA_2011_centroids = gpd.read_file(
f"zip://{zip_path_2011}!OutputArea2011_PWC_WGS84.csv",
columns=["code", "easting", "northing"],
OA_2011_centroids = utils.read_csv_from_7z(
sevenzip_path_2011,
"OutputArea2011_PWC_WGS84.csv",
usecols=["code", "easting", "northing"],
)
OA_2011_centroids = utils.add_latlon(OA_2011_centroids, "easting", "northing")
scots_lookup_2011 = OA_2011_centroids[["code", "latitude", "longitude"]]
OA_2021_centroids = gpd.read_file(
f"zip://{zip_path_2021}!OutputArea2022_PWC_WGS84.csv",
columns=["code", "easting", "northing"],
OA_2021_centroids = utils.read_csv_from_7z(
sevenzip_path_2021,
"OutputArea2022_PWC_WGS84.csv",
usecols=["code", "easting", "northing"],
)
OA_2021_centroids = utils.add_latlon(OA_2021_centroids, "easting", "northing")
scots_lookup_2021 = OA_2021_centroids[["code", "latitude", "longitude"]]
scots_lookup = pd.concat([scots_lookup_2011, scots_lookup_2021]).reset_index(
drop=True
)

DZ_2011_centroids = gpd.read_file(
f"zip://{zip_path_2011}!SG_DataZone_Cent_2011.csv",
columns=["DataZone", "Easting", "Northing"],
DZ_2011_centroids = utils.read_csv_from_7z(
sevenzip_path_2011,
"SG_DataZone_Cent_2011.csv",
usecols=["DataZone", "Easting", "Northing"],
)
DZ_2011_centroids = utils.add_latlon(DZ_2011_centroids, "Easting", "Northing")
scots_dz_lookup_2011 = DZ_2011_centroids[
["DataZone", "latitude", "longitude"]
].rename(columns={"DataZone": "code"})
DZ_2021_centroids = gpd.read_file(
f"zip://{zip_path_2021}!SG_DataZone_Cent_2022.csv",
columns=["DataZone", "Easting", "Northing"],
DZ_2021_centroids = utils.read_csv_from_7z(
sevenzip_path_2021,
"SG_DataZone_Cent_2022.csv",
usecols=["DataZone", "Easting", "Northing"],
)
DZ_2021_centroids = utils.add_latlon(DZ_2021_centroids, "Easting", "Northing")
scots_dz_lookup_2021 = DZ_2021_centroids[
Expand Down Expand Up @@ -204,19 +202,24 @@ def _load_llsoa_boundaries_engwales_regions(self, version: Literal["2011", "2021

def _load_llsoa_boundaries_scots_regions(self, version: Literal["2011", "2021"]):
"""
Load the LLSOA boundaries for Scotland from the NRS zipfile.
Load the LLSOA boundaries for Scotland from the NRS 7z file.

Parameters
----------
`version` : Literal["2011", "2021"]
The version of the LLSOA boundaries to load.
"""
zip_path = self.data_dir.joinpath(f"nrs_{version}.zip")
sevenzip_path = self.data_dir.joinpath(f"nrs_{version}.7z")
llsoa_filename = {
"2011": "OutputArea2011_EoR_WGS84.shp",
"2021": "OutputArea2022_EoR.shp",
"2011": "OutputArea2011_EoR_WGS84.geojson",
"2021": "OutputArea2022_EoR.geojson",
}
gdf = gpd.read_file(f"zip://{zip_path}!{llsoa_filename[version]}")
target_file = llsoa_filename[version]

with tempfile.TemporaryDirectory() as tmpdir:
utils.extract_from_7z(sevenzip_path, target_file, tmpdir)
extracted_file = Path(tmpdir) / target_file
gdf = gpd.read_file(extracted_file)
if version == "2021":
gdf.set_crs("EPSG:27700", inplace=True)
gdf.to_crs("EPSG:4326", inplace=True)
Expand Down Expand Up @@ -273,11 +276,9 @@ def _load_datazone_lookup(self, version: Literal["2011", "2021"]):
f"Loading {version} LLSOA<->Datazone lookup from cache {cache_label}"
)
return datazone_lookup_cache_contents
zip_path = self.data_dir.joinpath(f"nrs_{version}.zip")
sevenzip_path = self.data_dir.joinpath(f"nrs_{version}.7z")
dz_lookup_filename = {"2011": "OA_DZ_IZ_2011.csv", "2021": "OA22_DZ22_IZ22.csv"}
with zipfile.ZipFile(zip_path, "r") as nrs_zip:
with nrs_zip.open(dz_lookup_filename[version], "r") as fid:
dz_lookup = pd.read_csv(fid)
dz_lookup = utils.read_csv_from_7z(sevenzip_path, dz_lookup_filename[version])
if version == "2011":
dz_lookup.set_index("OutputArea2011Code", inplace=True)
dz_lookup.drop(columns=["IntermediateZone2011Code"], inplace=True)
Expand Down Expand Up @@ -522,7 +523,11 @@ def _load_postcode_llsoa_lookup(self):
"Loading postcode<->LLSOA lookup from cache ('%s')", "pc_llsoa_lookup"
)
return postcode_llsoa_lookup_cache_contents
pc_llsoa_lookup = pd.read_csv(self.pc_llsoa_zipfile, dtype=str)
pc_llsoa_lookup = utils.read_csv_from_7z(
self.pc_llsoa_sevenzipfile,
"PCD_OA_LSOA_MSOA_LAD_MAY22_UK_LU.csv",
dtype=str,
)
pc_llsoa_lookup["postcode"] = (
pc_llsoa_lookup.pcds.str.strip().str.upper().str.replace(" ", "")
)
Expand Down
60 changes: 60 additions & 0 deletions geocode/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,18 @@
- First Authored: 2022-10-19
"""

import io
import sys
import logging
import requests
import json
from pathlib import Path
from typing import Optional, Iterable, Tuple, Union, List, Dict

import geopandas as gpd
import pandas as pd
import pyproj
import py7zr


class GenericException(Exception):
Expand Down Expand Up @@ -423,3 +426,60 @@ def add_latlon(
df[lat_col] = df.geometry.y
df[lon_col] = df.geometry.x
return df


def extract_from_7z(archive_path: Path, file_to_extract: str, tmp_dir: str):
    """
    Unpack a single named member of a .7z archive into a directory on disk.

    Parameters
    ----------
    `archive_path` : Path
        Location of the .7z archive.
    `file_to_extract` : str
        Name of the archive member to unpack
        (e.g. "OutputArea2011_EoR_WGS84.geojson").
    `tmp_dir` : str
        Directory that the member is extracted into.
    """
    wanted_members = [file_to_extract]
    # The archive location is handed to py7zr as a plain string.
    sevenzip = py7zr.SevenZipFile(str(archive_path), mode="r")
    with sevenzip as handle:
        handle.extract(path=tmp_dir, targets=wanted_members)


def read_csv_from_7z(archive_path: Path, file_to_read: str, **kwargs):
    """
    Load one CSV member of a .7z archive into a DataFrame without writing it to disk.

    Parameters
    ----------
    `archive_path` : Path
        Location of the .7z archive.
    `file_to_read` : str
        Name of the CSV member inside the archive.
    `**kwargs`
        Extra keyword arguments forwarded unchanged to `pd.read_csv`.

    Returns
    -------
    `pd.DataFrame`
        The parsed contents of the requested CSV member.
    """

    class _BufferFactory:
        """
        Writer factory handed to py7zr so extracted members land in memory.
        """

        def __init__(self):
            # Maps the filename passed to `create` to its in-memory buffer.
            self.buffers = {}

        def create(self, filename):
            # A fresh buffer is registered on every call for the given name.
            sink = io.BytesIO()
            self.buffers[filename] = sink
            return sink

    collector = _BufferFactory()
    wanted_members = [file_to_read]

    with py7zr.SevenZipFile(str(archive_path), mode="r") as handle:
        handle.extract(targets=wanted_members, factory=collector)
        payload = collector.buffers[file_to_read]
        # Rewind so pandas parses from the first byte of the extracted data.
        payload.seek(0)
        frame = pd.read_csv(payload, **kwargs)
    return frame
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ fiona
rtree
shapely>=1.7.0
pyshp
geopandas
geopandas
py7zr
Loading