From d53e637f601c8d0d3b2e7f95f824522b35cabf16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Tue, 1 Feb 2022 20:08:41 +0100 Subject: [PATCH 01/49] Move oonidata related functions into a separate package --- af/fastpath/fastpath/core.py | 8 +-- af/fastpath/fastpath/db.py | 2 +- af/fastpath/fastpath/metrics.py | 24 --------- af/fastpath/fastpath/oonidata | 1 + af/fastpath/fastpath/reprocessor.py | 4 +- af/fastpath/fastpath/sshfeeder.py | 6 +-- af/fastpath/fastpath/tests/test_functional.py | 2 +- .../tests/test_functional_normalize.py | 4 +- af/fastpath/fastpath/tests/test_unit.py | 4 +- af/fastpath/setup.py | 2 +- oonidata/oonidata/__init__.py | 0 oonidata/oonidata/main.py | 46 ++++++++++++++++ oonidata/oonidata/metrics.py | 54 +++++++++++++++++++ .../fastpath => oonidata/oonidata}/mytypes.py | 0 .../oonidata}/normalize.py | 0 .../oonidata}/s3feeder.py | 8 +-- .../fastpath => oonidata/oonidata}/utils.py | 0 oonidata/setup.py | 19 +++++++ 18 files changed, 140 insertions(+), 44 deletions(-) delete mode 100644 af/fastpath/fastpath/metrics.py create mode 120000 af/fastpath/fastpath/oonidata create mode 100644 oonidata/oonidata/__init__.py create mode 100644 oonidata/oonidata/main.py create mode 100644 oonidata/oonidata/metrics.py rename {af/fastpath/fastpath => oonidata/oonidata}/mytypes.py (100%) rename {af/fastpath/fastpath => oonidata/oonidata}/normalize.py (100%) rename {af/fastpath/fastpath => oonidata/oonidata}/s3feeder.py (98%) rename {af/fastpath/fastpath => oonidata/oonidata}/utils.py (100%) create mode 100644 oonidata/setup.py diff --git a/af/fastpath/fastpath/core.py b/af/fastpath/fastpath/core.py index af1a727c..5c597e45 100644 --- a/af/fastpath/fastpath/core.py +++ b/af/fastpath/fastpath/core.py @@ -34,7 +34,7 @@ no_journal_handler = True # Feeds measurements from S3 -import fastpath.s3feeder as s3feeder +import fastpath.oonidata.s3feeder as s3feeder # Feeds measurements from a local HTTP API from fastpath.localhttpfeeder import 
start_http_api @@ -43,10 +43,10 @@ import fastpath.db as db from fastpath.metrics import setup_metrics -from fastpath.mytypes import MsmtTup +from fastpath.oonidata.mytypes import MsmtTup import fastpath.portable_queue as queue -import fastpath.utils +import fastpath.oonidata.utils LOCALITY_VALS = ("general", "global", "country", "isp", "local") @@ -1600,7 +1600,7 @@ def setup_fingerprints(): fingerprints = { "ZZ": {"body_match": [], "header_prefix": [], "header_full": [], "dns_full": []} } - for cc, fprints in fastpath.utils.fingerprints.items(): + for cc, fprints in fastpath.oonidata.utils.fingerprints.items(): d = fingerprints.setdefault(cc, {}) for fp in fprints: assert fp["locality"] in LOCALITY_VALS, fp["locality"] diff --git a/af/fastpath/fastpath/db.py b/af/fastpath/fastpath/db.py index 32193a36..6c3d781f 100644 --- a/af/fastpath/fastpath/db.py +++ b/af/fastpath/fastpath/db.py @@ -19,7 +19,7 @@ from clickhouse_driver import Client as Clickhouse import ujson -from fastpath.metrics import setup_metrics +from fastpath.oonidata.metrics import setup_metrics log = logging.getLogger("fastpath.db") metrics = setup_metrics(name="fastpath.db") diff --git a/af/fastpath/fastpath/metrics.py b/af/fastpath/fastpath/metrics.py deleted file mode 100644 index 470ec975..00000000 --- a/af/fastpath/fastpath/metrics.py +++ /dev/null @@ -1,24 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Metric generation -""" - -from os.path import basename, splitext - -import statsd # debdeps: python3-statsd - - -def setup_metrics(host="localhost", name=None): - """Setup metric generation. Use dotted namespaces e.g. 
- "pipeline.centrifugation" - """ - if name is None: - import __main__ - - prefix = splitext(basename(__main__.__file__))[0] - else: - prefix = name - - prefix = prefix.strip(".") - return statsd.StatsClient(host, 8125, prefix=prefix) diff --git a/af/fastpath/fastpath/oonidata b/af/fastpath/fastpath/oonidata new file mode 120000 index 00000000..21aa3ae4 --- /dev/null +++ b/af/fastpath/fastpath/oonidata @@ -0,0 +1 @@ +../../../oonidata/oonidata \ No newline at end of file diff --git a/af/fastpath/fastpath/reprocessor.py b/af/fastpath/fastpath/reprocessor.py index 4e598b5d..bdb89499 100755 --- a/af/fastpath/fastpath/reprocessor.py +++ b/af/fastpath/fastpath/reprocessor.py @@ -45,9 +45,9 @@ import statsd # debdeps: python3-statsd import fastpath.db as db -import fastpath.s3feeder as s3f +import fastpath.oonidata.s3feeder as s3f from fastpath.core import score_measurement, setup_fingerprints, unwrap_msmt -from fastpath.utils import trivial_id +from fastpath.oonidata.utils import trivial_id metrics = statsd.StatsClient("127.0.0.1", 8125, prefix="reprocessor") log = logging.getLogger("reprocessor") diff --git a/af/fastpath/fastpath/sshfeeder.py b/af/fastpath/fastpath/sshfeeder.py index fcb484a4..c0cf8332 100755 --- a/af/fastpath/fastpath/sshfeeder.py +++ b/af/fastpath/fastpath/sshfeeder.py @@ -23,9 +23,9 @@ warnings.filterwarnings(action="ignore", module=".*paramiko.*") -import fastpath.normalize as normalize # noqa -from fastpath.metrics import setup_metrics # noqa -from fastpath.mytypes import MsmtTup +import fastpath.oonidata.normalize as normalize # noqa +from fastpath.oonidata.metrics import setup_metrics # noqa +from fastpath.oonidata.mytypes import MsmtTup log = logging.getLogger("fastpath") diff --git a/af/fastpath/fastpath/tests/test_functional.py b/af/fastpath/fastpath/tests/test_functional.py index 8bec8b46..e6a2cea6 100644 --- a/af/fastpath/fastpath/tests/test_functional.py +++ b/af/fastpath/fastpath/tests/test_functional.py @@ -15,7 +15,7 @@ import pytest # 
debdeps: python3-pytest import fastpath.core as fp -import fastpath.s3feeder as s3feeder +import fastpath.oonidata.s3feeder as s3feeder log = logging.getLogger() diff --git a/af/fastpath/fastpath/tests/test_functional_normalize.py b/af/fastpath/fastpath/tests/test_functional_normalize.py index c77f7bf0..6f489267 100644 --- a/af/fastpath/fastpath/tests/test_functional_normalize.py +++ b/af/fastpath/fastpath/tests/test_functional_normalize.py @@ -16,8 +16,8 @@ import tarfile import ujson -import fastpath.normalize as norm -from fastpath.s3feeder import create_s3_client +import fastpath.oonidata.normalize as norm +from fastpath.oonidata.s3feeder import create_s3_client log = logging.getLogger() diff --git a/af/fastpath/fastpath/tests/test_unit.py b/af/fastpath/fastpath/tests/test_unit.py index 88c0f141..ef9a1efc 100644 --- a/af/fastpath/fastpath/tests/test_unit.py +++ b/af/fastpath/fastpath/tests/test_unit.py @@ -6,9 +6,9 @@ import ujson -from fastpath.utils import trivial_id +from fastpath.oonidata.utils import trivial_id import fastpath.core as fp -import fastpath.s3feeder as s3feeder +import fastpath.oonidata.s3feeder as s3feeder def test_trivial_id(): diff --git a/af/fastpath/setup.py b/af/fastpath/setup.py index f34f9153..0150efed 100644 --- a/af/fastpath/setup.py +++ b/af/fastpath/setup.py @@ -11,7 +11,7 @@ setup( name=NAME, python_requires=">=3.7.0", - packages=["fastpath", "fastpath.tests"], + packages=["fastpath", "fastpath.oonidata", "fastpath.tests"], entry_points={"console_scripts": [ "fastpath=fastpath.core:main", "reprocessor=fastpath.reprocessor:main", diff --git a/oonidata/oonidata/__init__.py b/oonidata/oonidata/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py new file mode 100644 index 00000000..8a56d0f6 --- /dev/null +++ b/oonidata/oonidata/main.py @@ -0,0 +1,46 @@ +import argparse +from collections import namedtuple +import datetime as dt +import pathlib +import sys +from 
typing import List + +from .s3feeder import stream_cans + +Config = namedtuple("Config", ["ccs", "testnames", "keep_s3_cache", "s3cachedir"]) + +def sync(args): + conf = Config( + ccs=args.country, + testnames=args.test_name, + keep_s3_cache=True, + s3cachedir=args.output_dir + ) + for msmt in stream_cans(conf, args.first_date, args.last_date): + print(msmt) + +def _parse_date_flag(date_str: str) -> dt.date: + return dt.datetime.strptime(date_str, "%Y-%m-%d").date() + +def main(): + parser = argparse.ArgumentParser("OONI Data tools") + + subparsers = parser.add_subparsers() + + parser_sync = subparsers.add_parser("sync", help="Sync OONI measurements") + parser_sync.add_argument("--country", type=str, required=True) + parser_sync.add_argument("--first_date", type=_parse_date_flag, + default=dt.date.today() - dt.timedelta(days=14)) + parser_sync.add_argument("--last_date", type=_parse_date_flag, + default=dt.date.today()) + parser_sync.add_argument("--test_name", type=str, default='webconnectivity') + parser_sync.add_argument("--max_string_size", type=int, default=1000) + parser_sync.add_argument("--output_dir", type=pathlib.Path, required=True) + parser_sync.add_argument("--debug", action="store_true") + parser_sync.set_defaults(func=sync) + + args = parser.parse_args() + sys.exit(args.func(args)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/oonidata/oonidata/metrics.py b/oonidata/oonidata/metrics.py new file mode 100644 index 00000000..b26ee570 --- /dev/null +++ b/oonidata/oonidata/metrics.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- + +""" +Metric generation +""" + +from os.path import basename, splitext +from functools import wraps + +class MockTimer(object): + def __call__(self, f): + @wraps(f) + def _f(*a, **k): + return f(*a, **k) + return _f + +class MockStatsClient(object): + """ + API compatible with the statsd client, but does nothing. 
+ """ + def __init__(self, host=None, port=None, prefix=None, sample_rate=None): + pass + def incr(self, stat, count=1, rate=1): + pass + def decr(self, stat, count=1, rate=1): + pass + def gauge(self, stat, value, rate=1, delta=False): + pass + def set(self, stat, value, rate=1): + pass + def timer(self, stat, rate): + return MockTimer() + +_STATSD_AVAILABLE = True +try: + import statsd # debdeps: python3-statsd + statsdclient = statsd.StatsClient +except ImportError: + _STATSD_AVAILABLE = False + statsdclient = MockStatsClient + +def setup_metrics(host="localhost", name=None): + """Setup metric generation. Use dotted namespaces e.g. + "pipeline.centrifugation" + """ + if name is None: + import __main__ + + prefix = splitext(basename(__main__.__file__))[0] + else: + prefix = name + + prefix = prefix.strip(".") + return statsd.StatsClient(host, 8125, prefix=prefix) diff --git a/af/fastpath/fastpath/mytypes.py b/oonidata/oonidata/mytypes.py similarity index 100% rename from af/fastpath/fastpath/mytypes.py rename to oonidata/oonidata/mytypes.py diff --git a/af/fastpath/fastpath/normalize.py b/oonidata/oonidata/normalize.py similarity index 100% rename from af/fastpath/fastpath/normalize.py rename to oonidata/oonidata/normalize.py diff --git a/af/fastpath/fastpath/s3feeder.py b/oonidata/oonidata/s3feeder.py similarity index 98% rename from af/fastpath/fastpath/s3feeder.py rename to oonidata/oonidata/s3feeder.py index 179e908d..54757f0c 100644 --- a/af/fastpath/fastpath/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -26,10 +26,10 @@ from botocore import UNSIGNED as botoSigUNSIGNED from botocore.config import Config as botoConfig -from fastpath.metrics import setup_metrics -from fastpath.mytypes import MsmtTup # msmt bytes, msmt dict, uid -from fastpath.normalize import iter_yaml_msmt_normalized -from fastpath.utils import trivial_id +from .metrics import setup_metrics +from .mytypes import MsmtTup # msmt bytes, msmt dict, uid +from .normalize import 
iter_yaml_msmt_normalized +from .utils import trivial_id CAN_BUCKET_NAME = "ooni-data" MC_BUCKET_NAME = "ooni-data-eu-fra" diff --git a/af/fastpath/fastpath/utils.py b/oonidata/oonidata/utils.py similarity index 100% rename from af/fastpath/fastpath/utils.py rename to oonidata/oonidata/utils.py diff --git a/oonidata/setup.py b/oonidata/setup.py new file mode 100644 index 00000000..b3ad0b10 --- /dev/null +++ b/oonidata/setup.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from setuptools import setup + +setup( + name="oonidata", + python_requires=">=3.7.0", + packages=["oonidata"], + entry_points={"console_scripts": [ + "oonidata=oonidata:main", + "reprocessor=fastpath.reprocessor:main", + "domain_input_updater=fastpath.domain_input:main", + ]}, + install_requires=REQUIRED, + include_package_data=True, + zip_safe=False, + package_data={'fastpath': ['views/*.tpl', 'static/*']}, +) From dc164f726ac196657191c4a9de7285585e37b596 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 2 Feb 2022 19:36:22 +0100 Subject: [PATCH 02/49] Implement sync functionality This is very heavily based on the work of @fortuna in https://github.com/Jigsaw-Code/net-analysis/. The code from there has been adapted to maximise the usage of existing ooni/pipeline code. 
--- oonidata/oonidata/main.py | 78 ++++++++++++++++++++++++++++++++---- oonidata/oonidata/metrics.py | 5 ++- oonidata/setup.py | 6 +-- 3 files changed, 78 insertions(+), 11 deletions(-) diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index 8a56d0f6..33f70d72 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -1,23 +1,87 @@ import argparse +import shutil from collections import namedtuple +import tempfile +import os +import gzip +import itertools +import logging import datetime as dt import pathlib import sys -from typing import List +import time +from typing import List, Generator, Tuple, List -from .s3feeder import stream_cans +import ujson + +from .s3feeder import stream_cans, load_multiple +from .s3feeder import list_cans_on_s3_for_a_day, list_minicans_on_s3_for_a_day, fetch_cans +from .s3feeder import create_s3_client, _calculate_etr Config = namedtuple("Config", ["ccs", "testnames", "keep_s3_cache", "s3cachedir"]) +FileEntry = namedtuple("FileEntry", ["country", "test_name", "date", "basename"]) + +log = logging.getLogger("oonidata") +logging.basicConfig(level=logging.INFO) def sync(args): + s3cachedir = tempfile.TemporaryDirectory() conf = Config( ccs=args.country, testnames=args.test_name, - keep_s3_cache=True, - s3cachedir=args.output_dir + keep_s3_cache=False, + s3cachedir=pathlib.Path(s3cachedir.name) ) - for msmt in stream_cans(conf, args.first_date, args.last_date): - print(msmt) + t0 = time.time() + day = args.first_date + today = dt.date.today() + stop_day = args.last_date if args.last_date < today else today + s3 = create_s3_client() + while day < stop_day: + cans_fns = list_cans_on_s3_for_a_day(s3, day) + minicans_fns = list_minicans_on_s3_for_a_day(s3, day, conf.ccs, conf.testnames) + cans_fns.extend(minicans_fns) + + log.info(f"Downloading {len(cans_fns)} cans") + test_name = args.test_name.replace("_", "") + for cn, can_tuple in enumerate(cans_fns): + s3fname, size = can_tuple + basename = 
pathlib.Path(s3fname).name + if not basename.endswith(".jsonl.gz"): + basename = basename.rsplit('.', 2)[0] + '.jsonl.gz' + dst_path = args.output_dir / args.country / test_name / f"{day:%Y-%m-%d}" / basename + if dst_path.is_file(): + continue + os.makedirs(dst_path.parent, exist_ok=True) + temp_path = dst_path.with_name(f"{dst_path.name}.tmp") + try: + with gzip.open(temp_path, mode="wt", encoding="utf-8", newline="\n") as out_file: + for can_f in fetch_cans(s3, conf, [can_tuple]): + try: + etr = _calculate_etr(t0, time.time(), args.first_date, day, stop_day, cn, len(cans_fns)) + log.info(f"Estimated time remaining: {etr}") + for msmt_tup in load_multiple(can_f.as_posix()): + msmt = msmt_tup[1] + if msmt["test_name"].replace("_", "") != test_name: + continue + if msmt["probe_cc"] != args.country: + continue + ujson.dump(msmt, out_file) + out_file.write("\n") + except Exception as e: + log.error(str(e), exc_info=True) + try: + can_f.unlink() + except FileNotFoundError: + pass + temp_path.replace(dst_path) + except: + temp_path.unlink() + s3cachedir.cleanup() + raise + + day += dt.timedelta(days=1) + s3cachedir.cleanup() def _parse_date_flag(date_str: str) -> dt.date: return dt.datetime.strptime(date_str, "%Y-%m-%d").date() @@ -43,4 +107,4 @@ def main(): sys.exit(args.func(args)) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/oonidata/oonidata/metrics.py b/oonidata/oonidata/metrics.py index b26ee570..e75310ba 100644 --- a/oonidata/oonidata/metrics.py +++ b/oonidata/oonidata/metrics.py @@ -4,9 +4,12 @@ Metric generation """ +import logging from os.path import basename, splitext from functools import wraps +log = logging.getLogger("fastpath") + class MockTimer(object): def __call__(self, f): @wraps(f) @@ -25,7 +28,7 @@ def incr(self, stat, count=1, rate=1): def decr(self, stat, count=1, rate=1): pass def gauge(self, stat, value, rate=1, delta=False): - pass + log.info(f"{stat}: {value}") def set(self, stat, value, rate=1): pass 
def timer(self, stat, rate): diff --git a/oonidata/setup.py b/oonidata/setup.py index b3ad0b10..aeece5e1 100644 --- a/oonidata/setup.py +++ b/oonidata/setup.py @@ -9,10 +9,10 @@ packages=["oonidata"], entry_points={"console_scripts": [ "oonidata=oonidata:main", - "reprocessor=fastpath.reprocessor:main", - "domain_input_updater=fastpath.domain_input:main", ]}, - install_requires=REQUIRED, + install_requires=[ + "" + ], include_package_data=True, zip_safe=False, package_data={'fastpath': ['views/*.tpl', 'static/*']}, From 6f2f6029aedf7195f7cc4cfc58e75abec4bdb0c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 2 Feb 2022 19:39:25 +0100 Subject: [PATCH 03/49] Fix broken metrics import --- af/fastpath/fastpath/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/af/fastpath/fastpath/core.py b/af/fastpath/fastpath/core.py index 5c597e45..e17ace1f 100644 --- a/af/fastpath/fastpath/core.py +++ b/af/fastpath/fastpath/core.py @@ -42,7 +42,7 @@ # Push measurements into Postgres import fastpath.db as db -from fastpath.metrics import setup_metrics +from fastpath.oonidata.metrics import setup_metrics from fastpath.oonidata.mytypes import MsmtTup import fastpath.portable_queue as queue From 1c51f5ae19770c5caf7914bfe52888a90df3c721 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 2 Feb 2022 19:46:52 +0100 Subject: [PATCH 04/49] Use mocked statsd client when import is not available --- oonidata/oonidata/metrics.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/oonidata/oonidata/metrics.py b/oonidata/oonidata/metrics.py index e75310ba..a39a6f0e 100644 --- a/oonidata/oonidata/metrics.py +++ b/oonidata/oonidata/metrics.py @@ -34,12 +34,10 @@ def set(self, stat, value, rate=1): def timer(self, stat, rate): return MockTimer() -_STATSD_AVAILABLE = True try: import statsd # debdeps: python3-statsd statsdclient = statsd.StatsClient except ImportError: - _STATSD_AVAILABLE = False statsdclient = 
MockStatsClient def setup_metrics(host="localhost", name=None): @@ -54,4 +52,4 @@ def setup_metrics(host="localhost", name=None): prefix = name prefix = prefix.strip(".") - return statsd.StatsClient(host, 8125, prefix=prefix) + return statsdclient(host, 8125, prefix=prefix) From d2c302d06882d26e897bb385fc326d8a337c769f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 2 Feb 2022 19:51:07 +0100 Subject: [PATCH 05/49] Fixup setup.py --- oonidata/setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/oonidata/setup.py b/oonidata/setup.py index aeece5e1..146033e3 100644 --- a/oonidata/setup.py +++ b/oonidata/setup.py @@ -11,9 +11,10 @@ "oonidata=oonidata:main", ]}, install_requires=[ - "" + "boto3", + "pyyaml", + "ujson", + "lz4" ], - include_package_data=True, zip_safe=False, - package_data={'fastpath': ['views/*.tpl', 'static/*']}, ) From 3b3291700e06e392ec8a1a009dd82217e948f31d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Fri, 4 Feb 2022 11:53:59 +0100 Subject: [PATCH 06/49] Fix script path --- oonidata/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oonidata/setup.py b/oonidata/setup.py index 146033e3..e875ac42 100644 --- a/oonidata/setup.py +++ b/oonidata/setup.py @@ -8,7 +8,7 @@ python_requires=">=3.7.0", packages=["oonidata"], entry_points={"console_scripts": [ - "oonidata=oonidata:main", + "oonidata=oonidata.main:main", ]}, install_requires=[ "boto3", From cd28117d8ca5fe94632c8972d6e9ff040360e37f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Fri, 4 Feb 2022 11:55:05 +0100 Subject: [PATCH 07/49] Fix timer mock --- oonidata/oonidata/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oonidata/oonidata/metrics.py b/oonidata/oonidata/metrics.py index a39a6f0e..8fbea630 100644 --- a/oonidata/oonidata/metrics.py +++ b/oonidata/oonidata/metrics.py @@ -31,7 +31,7 @@ def gauge(self, stat, value, rate=1, delta=False): 
log.info(f"{stat}: {value}") def set(self, stat, value, rate=1): pass - def timer(self, stat, rate): + def timer(self, stat, rate=1): return MockTimer() try: From f795b5ec5728931838e5a951e0b9a19da45ef187 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Fri, 4 Feb 2022 17:56:47 +0100 Subject: [PATCH 08/49] Add support for trimming strings longer than a certain length --- oonidata/oonidata/main.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index 33f70d72..1375211f 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -1,6 +1,7 @@ import argparse import shutil from collections import namedtuple +from functools import singledispatch import tempfile import os import gzip @@ -24,6 +25,30 @@ log = logging.getLogger("oonidata") logging.basicConfig(level=logging.INFO) +# Taken from: +# https://github.com/Jigsaw-Code/net-analysis/blob/master/netanalysis/ooni/data/sync_measurements.py#L33 +@singledispatch +def trim_measurement(json_obj, max_string_size: int): + return json_obj + +@trim_measurement.register(dict) +def _(json_dict: dict, max_string_size: int): + keys_to_delete: List[str] = [] + for key, value in json_dict.items(): + if type(value) == str and len(value) > max_string_size: + keys_to_delete.append(key) + else: + trim_measurement(value, max_string_size) + for key in keys_to_delete: + del json_dict[key] + return json_dict + +@trim_measurement.register(list) +def _(json_list: list, max_string_size: int): + for item in json_list: + trim_measurement(item, max_string_size) + return json_list + def sync(args): s3cachedir = tempfile.TemporaryDirectory() conf = Config( @@ -66,6 +91,8 @@ def sync(args): continue if msmt["probe_cc"] != args.country: continue + if args.max_string_size: + msmt = trim_measurement(msmt, args.max_string_size) ujson.dump(msmt, out_file) out_file.write("\n") except Exception as e: @@ -98,7 +125,7 @@ 
def main(): parser_sync.add_argument("--last_date", type=_parse_date_flag, default=dt.date.today()) parser_sync.add_argument("--test_name", type=str, default='webconnectivity') - parser_sync.add_argument("--max_string_size", type=int, default=1000) + parser_sync.add_argument("--max_string_size", type=int) parser_sync.add_argument("--output_dir", type=pathlib.Path, required=True) parser_sync.add_argument("--debug", action="store_true") parser_sync.set_defaults(func=sync) From df42f2adcf3c86facf034b35a15d8d2d0c60c521 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Tue, 8 Feb 2022 16:59:48 +0100 Subject: [PATCH 09/49] Sync command now fetches data from s3 * Fetch from JSONL buckets instead of cans --- oonidata/oonidata/main.py | 23 +++--- oonidata/oonidata/s3feeder.py | 129 +++++++++++++++++++++++----------- 2 files changed, 97 insertions(+), 55 deletions(-) diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index 1375211f..e20d1168 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -16,7 +16,7 @@ import ujson from .s3feeder import stream_cans, load_multiple -from .s3feeder import list_cans_on_s3_for_a_day, list_minicans_on_s3_for_a_day, fetch_cans +from .s3feeder import list_jsonl_on_s3_for_a_day, fetch_cans from .s3feeder import create_s3_client, _calculate_etr Config = namedtuple("Config", ["ccs", "testnames", "keep_s3_cache", "s3cachedir"]) @@ -50,10 +50,11 @@ def _(json_list: list, max_string_size: int): return json_list def sync(args): + test_name = args.test_name.replace("_", "") s3cachedir = tempfile.TemporaryDirectory() conf = Config( ccs=args.country, - testnames=args.test_name, + testnames=test_name, keep_s3_cache=False, s3cachedir=pathlib.Path(s3cachedir.name) ) @@ -63,17 +64,13 @@ def sync(args): stop_day = args.last_date if args.last_date < today else today s3 = create_s3_client() while day < stop_day: - cans_fns = list_cans_on_s3_for_a_day(s3, day) - minicans_fns = 
list_minicans_on_s3_for_a_day(s3, day, conf.ccs, conf.testnames) - cans_fns.extend(minicans_fns) + jsonl_fns = list_jsonl_on_s3_for_a_day(s3, day, conf.ccs, conf.testnames) - log.info(f"Downloading {len(cans_fns)} cans") - test_name = args.test_name.replace("_", "") - for cn, can_tuple in enumerate(cans_fns): + if len(jsonl_fns) > 0: + log.info(f"Downloading {day} {len(jsonl_fns)} jsonl.gz") + for cn, can_tuple in enumerate(jsonl_fns): s3fname, size = can_tuple basename = pathlib.Path(s3fname).name - if not basename.endswith(".jsonl.gz"): - basename = basename.rsplit('.', 2)[0] + '.jsonl.gz' dst_path = args.output_dir / args.country / test_name / f"{day:%Y-%m-%d}" / basename if dst_path.is_file(): continue @@ -83,14 +80,10 @@ def sync(args): with gzip.open(temp_path, mode="wt", encoding="utf-8", newline="\n") as out_file: for can_f in fetch_cans(s3, conf, [can_tuple]): try: - etr = _calculate_etr(t0, time.time(), args.first_date, day, stop_day, cn, len(cans_fns)) + etr = _calculate_etr(t0, time.time(), args.first_date, day, stop_day, cn, len(jsonl_fns)) log.info(f"Estimated time remaining: {etr}") for msmt_tup in load_multiple(can_f.as_posix()): msmt = msmt_tup[1] - if msmt["test_name"].replace("_", "") != test_name: - continue - if msmt["probe_cc"] != args.country: - continue if args.max_string_size: msmt = trim_measurement(msmt, args.max_string_size) ujson.dump(msmt, out_file) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 54757f0c..ba0254bc 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -11,10 +11,12 @@ from datetime import date, timedelta from typing import Generator, Set +from collections import namedtuple from pathlib import Path import logging import os import time +import gzip import tarfile import lz4.frame as lz4frame # debdeps: python3-lz4 @@ -85,6 +87,14 @@ def load_multiple(fn: str) -> Generator[MsmtTup, None, None]: msmt_uid = trivial_id(msm) yield (None, msm, msmt_uid) + elif 
fn.endswith(".jsonl.gz"): + # New JSONL files + with gzip.open(fn) as f: + for line in f: + msm = ujson.loads(line) + msmt_uid = trivial_id(msm) + yield (None, msm, msmt_uid) + elif fn.endswith(".yaml.lz4"): # Legacy lz4 yaml files with lz4frame.open(fn) as f: @@ -145,17 +155,75 @@ def create_s3_client(): def list_cans_on_s3_for_a_day(s3, day: date): """List legacy cans.""" - prefix = f"{day}/" - r = s3.list_objects_v2(Bucket=CAN_BUCKET_NAME, Prefix="canned/" + prefix) + prefix = f"canned/{day}/" + paginator = s3.get_paginator("list_objects_v2") + files = [] + for r in paginator.paginate(Bucket=MC_BUCKET_NAME, Prefix=prefix): + if ("Contents" in r) ^ (day <= date(2020, 10, 21)): + # The last day with cans is 2020-10-21 + log.warn("%d can files found!", len(r.get("Contents", []))) + fs = r.get("Contents", []) + for f in fs: + files.append((f["Key"], f["Size"])) + return files - if ("Contents" in r) ^ (day <= date(2020, 10, 21)): - # The last day with cans is 2020-10-21 - log.warn("%d can files found!", len(r.get("Contents", []))) - fs = r.get("Contents", []) - files = [(f["Key"], f["Size"]) for f in fs] - return files +FileEntry = namedtuple("FileEntry", ["timestamp", "country_code", "test_name", "filename", "size", "ext", "fullpath"]) + +def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: + paginator = s3.get_paginator("list_objects_v2") + for r in paginator.paginate(Bucket=MC_BUCKET_NAME, Prefix=prefix): + for f in r.get("Contents", []): + fullpath = f["Key"] + filename = fullpath.split("/")[-1] + parts = filename.split("_") + test_name, _, _, ext = parts[2].split(".", 3) + file_entry = FileEntry( + timestamp=parts[0], + country_code=parts[1], + test_name=test_name, + filename=filename, + fullpath=fullpath, + size=f["Size"], + ext=ext, + ) + yield file_entry + +def _list_legacy_jsonl_on_s3_for_a_day(s3, day: date, country_code: str, test_name: str) -> list: + tstamp = day.strftime("%Y%m%d") + prefix = f"raw/{tstamp}/" + files = [] + for 
file_entry in iter_file_entries(s3, prefix): + if file_entry.ext != "jsonl.gz": + continue + + if country_code and file_entry.country_code != country_code: + continue + + if test_name and file_entry.test_name != test_name: + continue + + if file_entry.size > 0: + files.append((file_entry.fullpath, file_entry.size)) + return sorted(files) + +def list_jsonl_on_s3_for_a_day(s3, day: date, country_code: str, test_name: str) -> list: + if day >= date(2020, 10, 20): + return _list_legacy_jsonl_on_s3_for_a_day(s3, day, country_code, test_name) + + tstamp = day.strftime("%Y%m%d") + prefix = f"jsonl/{test_name}/{country_code}/{tstamp}/" + files = [] + for file_entry in iter_file_entries(s3, prefix): + if file_entry.ext != "jsonl.gz": + log.warn(f"Found non jsonl.gz file in jsonl prefix: {file_entry.fullpath}") + continue + + if file_entry.size > 0: + files.append((file_entry.fullpath, file_entry.size)) + + return sorted(files) def list_minicans_on_s3_for_a_day( s3, day: date, ccs: Set[str], testnames: Set[str] @@ -166,44 +234,25 @@ def list_minicans_on_s3_for_a_day( # s3cmd ls s3://ooni-data-eu-fra/raw/20210202 tstamp = day.strftime("%Y%m%d") prefix = f"raw/{tstamp}/" - cont_token = None files = [] - # list_objects_v2 returns 1000 objects max and needs a token (!= None) - while True: - kw = {} if cont_token is None else dict(ContinuationToken=cont_token) - r = s3.list_objects_v2(Bucket=MC_BUCKET_NAME, Prefix=prefix, **kw) - - cont_token = r.get("NextContinuationToken", None) - if ("Contents" in r) ^ (day >= date(2020, 10, 20)): - # The first day with minicans is 2020-10-20 - log.warn("%d minican files found!", len(r.get("Contents", []))) - - for f in r.get("Contents", []): - if not f["Key"].endswith(".tar.gz"): - continue - - # Example: - # raw/20210910/02/CU/signal/2021091002_CU_signal.n0.0.tar.gz - fname = f["Key"] - try: - _raw, _date, _hour, cc, testname, _ = fname.split("/") - except Exception: - log.warn(f"Ignoring unexpected minican filename {fname}") + for 
file_entry in iter_file_entries(s3, prefix): + if not file_entry.ext != "tar.gz": + continue - if ccs and cc not in ccs: - continue + if ccs and file_entry.country_code not in ccs: + continue - if testnames and testname not in testnames: - continue + if testnames and file_entry.test_name not in testnames: + continue - if f["Size"] > 0: - files.append((fname, f["Size"])) + if file_entry.size > 0: + files.append((file_entry.fullpath, file_entry.size)) - if cont_token is None: - log.info(f"Found {len(files)} minican .tar.gz files") - return sorted(files) + if (day >= date(2020, 10, 20)) ^ len(files) > 0: + # The first day with minicans is 2020-10-20 + log.warn("%d minican files found!", len(files)) - assert False + return sorted(files) def log_download(s3fname, size) -> None: From 85dbab0c78dbb001306ab235a961a16dc4f3ab83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Thu, 10 Feb 2022 18:45:07 +0100 Subject: [PATCH 10/49] More refactoring in preparation for supporting jsonl in fastpath --- oonidata/oonidata/main.py | 10 ++--- oonidata/oonidata/s3feeder.py | 70 +++++++++++++++++++++++------------ 2 files changed, 49 insertions(+), 31 deletions(-) diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index e20d1168..2199a7d5 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -15,7 +15,7 @@ import ujson -from .s3feeder import stream_cans, load_multiple +from .s3feeder import stream_cans, load_multiple, date_interval from .s3feeder import list_jsonl_on_s3_for_a_day, fetch_cans from .s3feeder import create_s3_client, _calculate_etr @@ -59,19 +59,15 @@ def sync(args): s3cachedir=pathlib.Path(s3cachedir.name) ) t0 = time.time() - day = args.first_date - today = dt.date.today() - stop_day = args.last_date if args.last_date < today else today s3 = create_s3_client() - while day < stop_day: + for day in date_interval(args.first_date, args.last_date): jsonl_fns = list_jsonl_on_s3_for_a_day(s3, day, conf.ccs, 
conf.testnames) - if len(jsonl_fns) > 0: log.info(f"Downloading {day} {len(jsonl_fns)} jsonl.gz") for cn, can_tuple in enumerate(jsonl_fns): s3fname, size = can_tuple basename = pathlib.Path(s3fname).name - dst_path = args.output_dir / args.country / test_name / f"{day:%Y-%m-%d}" / basename + dst_path = args.output_dir / test_name / args.country / f"{day:%Y-%m-%d}" / basename if dst_path.is_file(): continue os.makedirs(dst_path.parent, exist_ok=True) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index ba0254bc..57d9293a 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -325,7 +325,6 @@ def _cb(bytes_count): metrics.gauge("s3_download_speed_avg_Mbps", 0) - # TODO: merge with stream_daily_cans, add caching to the latter to be used # during functional tests # @metrics.timer("fetch_cans_for_a_day_with_cache") @@ -357,40 +356,63 @@ def _update_eta(t0, start_day, day, stop_day, can_num, can_tot_count): except: pass - -def stream_cans(conf, start_day: date, end_day: date) -> Generator[MsmtTup, None, None]: - """Stream cans from S3""" +def date_interval(start_day: date, end_day: date): today = date.today() if not start_day or start_day >= today: - return - - log.info("Fetching older cans from S3") - t0 = time.time() + raise StopIteration day = start_day - s3 = create_s3_client() # the last day is not included stop_day = end_day if end_day < today else today while day < stop_day: + yield day + day += timedelta(days=1) + +def stream_measurements_from_files(s3, conf, filenames, cb_file_done=None) -> Generator[MsmtTup, None, None]: + for cn, can_f in enumerate(fetch_cans(s3, conf, filenames)): + try: + if cb_file_done: + cb_file_done() + # log.info("can %s ready", can_f.name) + for msmt_tup in load_multiple(can_f.as_posix()): + yield msmt_tup + except Exception as e: + log.error(str(e), exc_info=True) + + if not conf.keep_s3_cache: + try: + can_f.unlink() + except FileNotFoundError: + pass + +def stream_cans(conf, 
start_day: date, end_day: date) -> Generator[MsmtTup, None, None]: + """Stream cans from S3""" + log.info("Fetching older cans from S3") + t0 = time.time() + s3 = create_s3_client() + lambda cb_file_done: _update_eta(t0, start_day, day, stop_day, cn, len(cans_fns)) + for day in date_interval(start_day, end_day): log.info("Processing day %s", day) cans_fns = list_cans_on_s3_for_a_day(s3, day) minicans_fns = list_minicans_on_s3_for_a_day(s3, day, conf.ccs, conf.testnames) cans_fns.extend(minicans_fns) - for cn, can_f in enumerate(fetch_cans(s3, conf, cans_fns)): - try: - _update_eta(t0, start_day, day, stop_day, cn, len(cans_fns)) - # log.info("can %s ready", can_f.name) - for msmt_tup in load_multiple(can_f.as_posix()): - yield msmt_tup - except Exception as e: - log.error(str(e), exc_info=True) - - if not conf.keep_s3_cache: - try: - can_f.unlink() - except FileNotFoundError: - pass + for msmt_tup in stream_measurements_from_files(s3, conf, cans_fns, cb_file_done=cb_file_done): + yield msmt_tup - day += timedelta(days=1) + if end_day: + log.info(f"Reached {end_day}, streaming cans from S3 finished") + return + +def stream_jsonl(conf, start_day: date, end_day: date) -> Generator[MsmtTup, None, None]: + """Stream jsonl from S3""" + log.info("Fetching older cans from S3") + t0 = time.time() + s3 = create_s3_client() + lambda cb_file_done: _update_eta(t0, start_day, day, stop_day, cn, len(cans_fns)) + for day in date_interval(start_day, end_day): + log.info("Processing day %s", day) + jsonl_fns = list_jsonl_on_s3_for_a_day(s3, day, conf.ccs, conf.testnames) + for msmt_tup in stream_measurements_from_files(s3, conf, jsonl_fns, cb_file_done=cb_file_done): + yield msmt_tup if end_day: log.info(f"Reached {end_day}, streaming cans from S3 finished") From 72fc33ddfd08ee8f21b4f028261a0085b0b362ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Fri, 11 Feb 2022 13:16:38 +0100 Subject: [PATCH 11/49] Refactoring of the jsonl related functionality * 
Minimise duplication between can fetch and jsonl fetch code * Improve efficiency in how jsonl files are listed --- oonidata/oonidata/main.py | 66 +++++++++++++--------------------- oonidata/oonidata/s3feeder.py | 67 +++++++++++++++++++++-------------- 2 files changed, 66 insertions(+), 67 deletions(-) diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index 2199a7d5..e0ed9d1b 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -15,9 +15,8 @@ import ujson -from .s3feeder import stream_cans, load_multiple, date_interval -from .s3feeder import list_jsonl_on_s3_for_a_day, fetch_cans -from .s3feeder import create_s3_client, _calculate_etr +from .s3feeder import create_s3_client +from .s3feeder import jsonl_in_range, stream_measurements_from_files Config = namedtuple("Config", ["ccs", "testnames", "keep_s3_cache", "s3cachedir"]) FileEntry = namedtuple("FileEntry", ["country", "test_name", "date", "basename"]) @@ -53,50 +52,34 @@ def sync(args): test_name = args.test_name.replace("_", "") s3cachedir = tempfile.TemporaryDirectory() conf = Config( - ccs=args.country, - testnames=test_name, + ccs=[args.country], + testnames=[test_name], keep_s3_cache=False, s3cachedir=pathlib.Path(s3cachedir.name) ) t0 = time.time() s3 = create_s3_client() - for day in date_interval(args.first_date, args.last_date): - jsonl_fns = list_jsonl_on_s3_for_a_day(s3, day, conf.ccs, conf.testnames) - if len(jsonl_fns) > 0: - log.info(f"Downloading {day} {len(jsonl_fns)} jsonl.gz") - for cn, can_tuple in enumerate(jsonl_fns): - s3fname, size = can_tuple - basename = pathlib.Path(s3fname).name - dst_path = args.output_dir / test_name / args.country / f"{day:%Y-%m-%d}" / basename - if dst_path.is_file(): - continue - os.makedirs(dst_path.parent, exist_ok=True) - temp_path = dst_path.with_name(f"{dst_path.name}.tmp") - try: - with gzip.open(temp_path, mode="wt", encoding="utf-8", newline="\n") as out_file: - for can_f in fetch_cans(s3, conf, [can_tuple]): - try: - 
etr = _calculate_etr(t0, time.time(), args.first_date, day, stop_day, cn, len(jsonl_fns)) - log.info(f"Estimated time remaining: {etr}") - for msmt_tup in load_multiple(can_f.as_posix()): - msmt = msmt_tup[1] - if args.max_string_size: - msmt = trim_measurement(msmt, args.max_string_size) - ujson.dump(msmt, out_file) - out_file.write("\n") - except Exception as e: - log.error(str(e), exc_info=True) - try: - can_f.unlink() - except FileNotFoundError: - pass - temp_path.replace(dst_path) - except: - temp_path.unlink() - s3cachedir.cleanup() - raise + for file_entry in jsonl_in_range(s3, conf, args.first_date, args.last_date): + dst_path = args.output_dir / file_entry.test_name / file_entry.country_code / f"{file_entry.timestamp:%Y-%m-%d}" / file_entry.filename + if dst_path.is_file(): + continue + os.makedirs(dst_path.parent, exist_ok=True) + temp_path = dst_path.with_name(f"{dst_path.name}.tmp") + try: + with gzip.open(temp_path, mode="wt", encoding="utf-8", newline="\n") as out_file: + jsonl_fns = [(file_entry.fullpath, file_entry.size)] + for msmt_tup in stream_measurements_from_files(s3, conf, jsonl_fns): + msmt = msmt_tup[1] + if args.max_string_size: + msmt = trim_measurement(msmt, args.max_string_size) + ujson.dump(msmt, out_file) + out_file.write("\n") + temp_path.replace(dst_path) + except: + temp_path.unlink() + s3cachedir.cleanup() + raise - day += dt.timedelta(days=1) s3cachedir.cleanup() def _parse_date_flag(date_str: str) -> dt.date: @@ -104,6 +87,7 @@ def _parse_date_flag(date_str: str) -> dt.date: def main(): parser = argparse.ArgumentParser("OONI Data tools") + parser.set_defaults(func=lambda r: parser.print_usage()) subparsers = parser.add_subparsers() diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 57d9293a..e69ecdd4 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -9,7 +9,7 @@ """ -from datetime import date, timedelta +from datetime import date, timedelta, datetime from typing 
import Generator, Set from collections import namedtuple from pathlib import Path @@ -179,7 +179,7 @@ def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: parts = filename.split("_") test_name, _, _, ext = parts[2].split(".", 3) file_entry = FileEntry( - timestamp=parts[0], + timestamp=datetime.strptime(parts[0], "%Y%m%d%H"), country_code=parts[1], test_name=test_name, filename=filename, @@ -189,7 +189,7 @@ def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: ) yield file_entry -def _list_legacy_jsonl_on_s3_for_a_day(s3, day: date, country_code: str, test_name: str) -> list: +def _legacy_jsonl_on_s3_for_a_day(s3, day: date, country_codes: Set[str], test_names: Set[str]) -> list: tstamp = day.strftime("%Y%m%d") prefix = f"raw/{tstamp}/" files = [] @@ -197,33 +197,51 @@ def _list_legacy_jsonl_on_s3_for_a_day(s3, day: date, country_code: str, test_na if file_entry.ext != "jsonl.gz": continue - if country_code and file_entry.country_code != country_code: + if len(country_codes) > 0 and file_entry.country_code not in country_codes: continue - if test_name and file_entry.test_name != test_name: + if len(test_names) > 0 and file_entry.test_name not in test_names: continue if file_entry.size > 0: - files.append((file_entry.fullpath, file_entry.size)) + yield file_entry - return sorted(files) -def list_jsonl_on_s3_for_a_day(s3, day: date, country_code: str, test_name: str) -> list: - if day >= date(2020, 10, 20): - return _list_legacy_jsonl_on_s3_for_a_day(s3, day, country_code, test_name) +def jsonl_in_range(s3, conf, start_day: date, end_day: date) -> Generator[FileEntry, None, None]: + # List all the jsonl file entries in the old bucket format + for day in date_interval(date(2020, 10, 20), end_day): + for fe in _legacy_jsonl_on_s3_for_a_day(s3, day, conf.ccs, conf.testnames): + yield fe + + prefixes = ["jsonl/"] + # We have both a testname list and a country code list, we can efficiently + # pre-filter based on prefix + if 
len(conf.testnames) > 0 and len(conf.ccs) > 0: + c = itertools.product(conf.testnames, conf.ccs, date_interval(start_day, date(2020, 10, 21))) + prefixes = [f"jsonl/{tn}/{cc}/{ts}" for cc, tn, ts in c] + + elif len(conf.testnames): + prefixes = [f"jsonl/{tn}/" for tn in conf.testnames] + + # In other cases, we are going to have to list all the bucket and do + # filtering based on filepath + for p in prefixes: + for file_entry in iter_file_entries(s3, p): + if file_entry.ext != "jsonl.gz": + log.warn(f"Found non jsonl.gz file in jsonl prefix: {file_entry.fullpath}") + continue - tstamp = day.strftime("%Y%m%d") - prefix = f"jsonl/{test_name}/{country_code}/{tstamp}/" - files = [] - for file_entry in iter_file_entries(s3, prefix): - if file_entry.ext != "jsonl.gz": - log.warn(f"Found non jsonl.gz file in jsonl prefix: {file_entry.fullpath}") - continue + if len(conf.ccs) > 0 and file_entry.country_code not in conf.ccs: + continue - if file_entry.size > 0: - files.append((file_entry.fullpath, file_entry.size)) + if len(conf.testnames) > 0 and file_entry.test_name not in conf.testnames: + continue - return sorted(files) + if file_entry.timestamp < start_day or file_entry.timestamp >= end_day: + continue + + if file_entry.size > 0: + yield file_entry def list_minicans_on_s3_for_a_day( s3, day: date, ccs: Set[str], testnames: Set[str] @@ -405,13 +423,10 @@ def stream_cans(conf, start_day: date, end_day: date) -> Generator[MsmtTup, None def stream_jsonl(conf, start_day: date, end_day: date) -> Generator[MsmtTup, None, None]: """Stream jsonl from S3""" log.info("Fetching older cans from S3") - t0 = time.time() s3 = create_s3_client() - lambda cb_file_done: _update_eta(t0, start_day, day, stop_day, cn, len(cans_fns)) - for day in date_interval(start_day, end_day): - log.info("Processing day %s", day) - jsonl_fns = list_jsonl_on_s3_for_a_day(s3, day, conf.ccs, conf.testnames) - for msmt_tup in stream_measurements_from_files(s3, conf, jsonl_fns, cb_file_done=cb_file_done): + 
for file_entry in jsonl_in_range(s3, conf, start_day, end_day): + jsonl_fns = [(file_entry.fullpath, file_entry.size)] + for msmt_tup in stream_measurements_from_files(s3, conf, jsonl_fns): yield msmt_tup if end_day: From f2e509419e7282d3ea8de3818f9a629424b48b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Fri, 11 Feb 2022 13:20:14 +0100 Subject: [PATCH 12/49] Fix bug in logic for determining ranges --- oonidata/oonidata/s3feeder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index e69ecdd4..b531fa75 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -209,7 +209,7 @@ def _legacy_jsonl_on_s3_for_a_day(s3, day: date, country_codes: Set[str], test_n def jsonl_in_range(s3, conf, start_day: date, end_day: date) -> Generator[FileEntry, None, None]: # List all the jsonl file entries in the old bucket format - for day in date_interval(date(2020, 10, 20), end_day): + for day in date_interval(max(date(2020, 10, 20), start_day), end_day): for fe in _legacy_jsonl_on_s3_for_a_day(s3, day, conf.ccs, conf.testnames): yield fe From 9eda08471cfad899d87faa0c19af13c7230d0d35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Tue, 15 Feb 2022 15:30:23 +0100 Subject: [PATCH 13/49] Improvements to oonidata CLI * Add support for passing lists of test_names and country codes * Use - instead of _ for CLI flags --- oonidata/oonidata/main.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index e0ed9d1b..2b6a345f 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -49,17 +49,24 @@ def _(json_list: list, max_string_size: int): return json_list def sync(args): - test_name = args.test_name.replace("_", "") + testnames = [] + ccs = [] + + if args.test_name: + testnames = list(map(lambda x: x.replace("_", ""), 
args.test_name)) + if args.country: + ccs = args.country + s3cachedir = tempfile.TemporaryDirectory() conf = Config( - ccs=[args.country], - testnames=[test_name], + ccs=ccs, + testnames=testnames, keep_s3_cache=False, s3cachedir=pathlib.Path(s3cachedir.name) ) t0 = time.time() s3 = create_s3_client() - for file_entry in jsonl_in_range(s3, conf, args.first_date, args.last_date): + for file_entry in jsonl_in_range(s3, conf, args.since, args.until): dst_path = args.output_dir / file_entry.test_name / file_entry.country_code / f"{file_entry.timestamp:%Y-%m-%d}" / file_entry.filename if dst_path.is_file(): continue @@ -92,14 +99,14 @@ def main(): subparsers = parser.add_subparsers() parser_sync = subparsers.add_parser("sync", help="Sync OONI measurements") - parser_sync.add_argument("--country", type=str, required=True) - parser_sync.add_argument("--first_date", type=_parse_date_flag, + parser_sync.add_argument("--country", type=str, nargs="*") + parser_sync.add_argument("--since", type=_parse_date_flag, default=dt.date.today() - dt.timedelta(days=14)) - parser_sync.add_argument("--last_date", type=_parse_date_flag, + parser_sync.add_argument("--until", type=_parse_date_flag, default=dt.date.today()) - parser_sync.add_argument("--test_name", type=str, default='webconnectivity') - parser_sync.add_argument("--max_string_size", type=int) - parser_sync.add_argument("--output_dir", type=pathlib.Path, required=True) + parser_sync.add_argument("--test-name", nargs="*") + parser_sync.add_argument("--max-string-size", type=int) + parser_sync.add_argument("--output-dir", type=pathlib.Path, required=True) parser_sync.add_argument("--debug", action="store_true") parser_sync.set_defaults(func=sync) From e078658bc998197e7580ab742dad6a2726844f7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Tue, 15 Feb 2022 16:25:32 +0100 Subject: [PATCH 14/49] TMP commit --- oonidata/oonidata/s3feeder.py | 45 ++++++++++++++++++++++++++++------- 1 file changed, 37 
insertions(+), 8 deletions(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index b531fa75..c825db47 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -10,7 +10,7 @@ """ from datetime import date, timedelta, datetime -from typing import Generator, Set +from typing import Generator, Set, NamedTuple, Any from collections import namedtuple from pathlib import Path import logging @@ -168,14 +168,24 @@ def list_cans_on_s3_for_a_day(s3, day: date): return files -FileEntry = namedtuple("FileEntry", ["timestamp", "country_code", "test_name", "filename", "size", "ext", "fullpath"]) +class FileEntry(NamedTuple): + timestamp: Any + country_code: str + test_name: str + filename: str + size: int + ext: str + s3path: str + bucket_name: str + + def output_path(self, dst_dir: Path): + return dst_dir / self.test_name / self.country_code / f"self.timestamp:%Y-%m-%d" / file_entry.filename def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: paginator = s3.get_paginator("list_objects_v2") for r in paginator.paginate(Bucket=MC_BUCKET_NAME, Prefix=prefix): for f in r.get("Contents", []): - fullpath = f["Key"] - filename = fullpath.split("/")[-1] + s3path = f["Key"].split("/")[-1] parts = filename.split("_") test_name, _, _, ext = parts[2].split(".", 3) file_entry = FileEntry( @@ -183,9 +193,10 @@ def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: country_code=parts[1], test_name=test_name, filename=filename, - fullpath=fullpath, + s3path=s3path, size=f["Size"], ext=ext, + bucket_name=MC_BUCKET_NAME ) yield file_entry @@ -228,7 +239,7 @@ def jsonl_in_range(s3, conf, start_day: date, end_day: date) -> Generator[FileEn for p in prefixes: for file_entry in iter_file_entries(s3, p): if file_entry.ext != "jsonl.gz": - log.warn(f"Found non jsonl.gz file in jsonl prefix: {file_entry.fullpath}") + log.warn(f"Found non jsonl.gz file in jsonl prefix: {file_entry.s3path}") continue if 
len(conf.ccs) > 0 and file_entry.country_code not in conf.ccs: @@ -264,7 +275,7 @@ def list_minicans_on_s3_for_a_day( continue if file_entry.size > 0: - files.append((file_entry.fullpath, file_entry.size)) + files.append((file_entry.s3path, file_entry.size)) if (day >= date(2020, 10, 20)) ^ len(files) > 0: # The first day with minicans is 2020-10-20 @@ -402,6 +413,24 @@ def stream_measurements_from_files(s3, conf, filenames, cb_file_done=None) -> Ge except FileNotFoundError: pass +def download_measurement_container(s3, conf, file_entry: FileEntry): + diskf = file_entry.output_path(conf.s3cachedir) + if diskf.exists() and file_entry.size == diskf.stat().st_size: + diskf.touch(exist_ok=True) + return diskf + + diskf.parent.mkdir(parents=True, exist_ok=True) + tmpf = diskf.with_suffix(".s3tmp") + with tmpf.open("wb") as f: + s3.download_fileobj(file_entry.bucket_name, file_entry.s3path, f, Callback=_cb) + f.flush() + os.fsync(f.fileno()) + metrics.gauge("fetching", 0) + tmpf.rename(diskf) + assert file_entry.size == diskf.stat().st_size + return diskf + + def stream_cans(conf, start_day: date, end_day: date) -> Generator[MsmtTup, None, None]: """Stream cans from S3""" log.info("Fetching older cans from S3") @@ -425,7 +454,7 @@ def stream_jsonl(conf, start_day: date, end_day: date) -> Generator[MsmtTup, Non log.info("Fetching older cans from S3") s3 = create_s3_client() for file_entry in jsonl_in_range(s3, conf, start_day, end_day): - jsonl_fns = [(file_entry.fullpath, file_entry.size)] + jsonl_fns = [(file_entry.s3path, file_entry.size)] for msmt_tup in stream_measurements_from_files(s3, conf, jsonl_fns): yield msmt_tup From ed605ccdd6f07af6794860283a41a1ea11b3b61e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Tue, 15 Feb 2022 19:30:23 +0100 Subject: [PATCH 15/49] Refactor all code related to can and jsonl listing --- oonidata/oonidata/s3feeder.py | 247 +++++++++++++++------------------- 1 file changed, 110 insertions(+), 137 deletions(-) diff 
--git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index c825db47..e1d0a9b0 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -153,24 +153,52 @@ def create_s3_client(): return boto3.client("s3", config=botoConfig(signature_version=botoSigUNSIGNED)) -def list_cans_on_s3_for_a_day(s3, day: date): +def list_cans_on_s3_for_a_day(s3, day: date) -> list: + return list(map(lambda fe: (fe.s3path, fe.size), iter_cans_on_s3_for_a_day(s3, day))) + +def iter_cans_on_s3_for_a_day(s3, day: date): """List legacy cans.""" prefix = f"canned/{day}/" paginator = s3.get_paginator("list_objects_v2") files = [] - for r in paginator.paginate(Bucket=MC_BUCKET_NAME, Prefix=prefix): + for r in paginator.paginate(Bucket=CAN_BUCKET_NAME , Prefix=prefix): if ("Contents" in r) ^ (day <= date(2020, 10, 21)): # The last day with cans is 2020-10-21 log.warn("%d can files found!", len(r.get("Contents", []))) - fs = r.get("Contents", []) - for f in fs: - files.append((f["Key"], f["Size"])) - return files + for f in r.get("Contents", []): + s3path = f["Key"] + filename = s3path.split("/")[-1] + country_code = None + ext = None + if filename.endswith(".tar.lz4"): + test_name = filename.split(".")[0].replace("_", "") + ext = "tar.lz4" + elif filename.endswith(".json.lz4"): + parts = filename.split("-") + country_code = parts[1] + test_name = parts[3].replace("_", "") + ext = "json.lz4" + else: + if filename != "index.json.gz": + log.warn(f"found an unexpected filename {filename}") + continue + + file_entry = FileEntry( + timestamp=day, + country_code=country_code, + test_name=test_name, + filename=filename, + size=f["Size"], + ext=ext, + s3path=s3path, + bucket_name=MC_BUCKET_NAME + ) + yield file_entry class FileEntry(NamedTuple): timestamp: Any - country_code: str + country_code: Any test_name: str filename: str size: int @@ -178,14 +206,33 @@ class FileEntry(NamedTuple): s3path: str bucket_name: str - def output_path(self, dst_dir: Path): + def 
output_path(self, dst_dir: Path) -> Path: return dst_dir / self.test_name / self.country_code / f"self.timestamp:%Y-%m-%d" / file_entry.filename + def matches_filter(self, ccs: Set[str], testnames: Set[str]) -> bool: + if self.country_code and len(ccs) > 0 and self.country_code not in ccs: + return False + + if self.test_name and len(testnames) > 0 and self.test_name not in testnames: + return False + + return True + + def log_download(self) -> None: + s = self.size / 1024 / 1024 + d = "M" + if s < 1: + s = self.size / 1024 + d = "K" + log.info(f"Downloading can {self.s3path} size {s:.1f} {d}B") + + def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: paginator = s3.get_paginator("list_objects_v2") for r in paginator.paginate(Bucket=MC_BUCKET_NAME, Prefix=prefix): for f in r.get("Contents", []): - s3path = f["Key"].split("/")[-1] + s3path = f["Key"] + filename = s3path.split("/")[-1] parts = filename.split("_") test_name, _, _, ext = parts[2].split(".", 3) file_entry = FileEntry( @@ -208,10 +255,7 @@ def _legacy_jsonl_on_s3_for_a_day(s3, day: date, country_codes: Set[str], test_n if file_entry.ext != "jsonl.gz": continue - if len(country_codes) > 0 and file_entry.country_code not in country_codes: - continue - - if len(test_names) > 0 and file_entry.test_name not in test_names: + if not file_entry.matches_filter(country_codes, test_names): continue if file_entry.size > 0: @@ -242,10 +286,7 @@ def jsonl_in_range(s3, conf, start_day: date, end_day: date) -> Generator[FileEn log.warn(f"Found non jsonl.gz file in jsonl prefix: {file_entry.s3path}") continue - if len(conf.ccs) > 0 and file_entry.country_code not in conf.ccs: - continue - - if len(conf.testnames) > 0 and file_entry.test_name not in conf.testnames: + if not file_entry.matches_filter(conf.ccs, conf.testnames): continue if file_entry.timestamp < start_day or file_entry.timestamp >= end_day: @@ -254,9 +295,14 @@ def jsonl_in_range(s3, conf, start_day: date, end_day: date) -> 
Generator[FileEn if file_entry.size > 0: yield file_entry -def list_minicans_on_s3_for_a_day( - s3, day: date, ccs: Set[str], testnames: Set[str] -) -> list: +def list_minicans_on_s3_for_a_day(s3, day: date, ccs: Set[str], testnames: Set[str]) -> list: + return list( + map(lambda fe: (fe.s3path, fe.size)), + filter(lambda fe: fe.matches_filter(ccs, testnames), + iter_minicans_on_s3_for_a_day(s3, day)) + ) + +def iter_minicans_on_s3_for_a_day(s3, day: date) -> Generator[FileEntry, None, None]: """List minicans. Filter them by CCs and testnames Testnames are without underscores. """ @@ -267,101 +313,12 @@ def list_minicans_on_s3_for_a_day( for file_entry in iter_file_entries(s3, prefix): if not file_entry.ext != "tar.gz": continue - - if ccs and file_entry.country_code not in ccs: - continue - - if testnames and file_entry.test_name not in testnames: - continue - - if file_entry.size > 0: - files.append((file_entry.s3path, file_entry.size)) + yield file_entry if (day >= date(2020, 10, 20)) ^ len(files) > 0: # The first day with minicans is 2020-10-20 log.warn("%d minican files found!", len(files)) - return sorted(files) - - -def log_download(s3fname, size) -> None: - s = size / 1024 / 1024 - d = "M" - if s < 1: - s = size / 1024 - d = "K" - log.info(f"Downloading can {s3fname} size {s:.1f} {d}B") - - -@metrics.timer("fetch_cans") -def fetch_cans(s3, conf, files) -> Generator[Path, None, None]: - """ - Download cans to a local directory - fnames = [("2013-09-12/20130912T150305Z-MD-AS1547-http_", size), ... 
] - yield each can file Path - """ - # fn: can filename without path - # diskf: File in the s3cachedir directory - cans = set() # (s3fname, filename on disk, size, download required) - for s3fname, size in files: - diskf = conf.s3cachedir / s3fname.split("/", 1)[1] - if diskf.exists() and size == diskf.stat().st_size: - metrics.incr("cache_hit") - diskf.touch(exist_ok=True) - cans.add((s3fname, diskf, size, False)) - else: - metrics.incr("cache_miss") - cans.add((s3fname, diskf, size, True)) - - def _cb(bytes_count): - if _cb.start_time is None: - _cb.start_time = time.time() - _cb.count = bytes_count - return - _cb.count += bytes_count - _cb.total_count += bytes_count - metrics.gauge("s3_download_percentage", _cb.total_count / _cb.total_size * 100) - try: - speed = _cb.count / 131_072 / (time.time() - _cb.start_time) - metrics.gauge("s3_download_speed_avg_Mbps", speed) - except ZeroDivisionError: - pass - - cans = sorted(cans) - _cb.total_size = sum(t[2] for t in cans if t[3]) - _cb.total_count = 0 - - for s3fname, diskf, size, dload_required in cans: - if not dload_required: - yield diskf # already in local cache - continue - - # TODO: handle missing file - log_download(s3fname, size) - diskf.parent.mkdir(parents=True, exist_ok=True) - tmpf = diskf.with_suffix(".s3tmp") - metrics.gauge("fetching", 1) - _cb.start_time = None - with tmpf.open("wb") as f: - bucket_name = CAN_BUCKET_NAME if "canned/" in s3fname else MC_BUCKET_NAME - s3.download_fileobj(bucket_name, s3fname, f, Callback=_cb) - f.flush() - os.fsync(f.fileno()) - metrics.gauge("fetching", 0) - tmpf.rename(diskf) - assert size == diskf.stat().st_size - yield diskf - - metrics.gauge("s3_download_speed_avg_Mbps", 0) - -# TODO: merge with stream_daily_cans, add caching to the latter to be used -# during functional tests -# @metrics.timer("fetch_cans_for_a_day_with_cache") -# def fetch_cans_for_a_day_with_cache(conf, day): -# s3 = create_s3_client() -# fns = list_cans_on_s3_for_a_day(s3, day) -# 
list(fetch_cans(s3, conf, fns)) - def _calculate_etr(t0, now, start_day, day, stop_day, can_num, can_tot_count) -> int: """Estimate total runtime in seconds. @@ -396,28 +353,32 @@ def date_interval(start_day: date, end_day: date): yield day day += timedelta(days=1) -def stream_measurements_from_files(s3, conf, filenames, cb_file_done=None) -> Generator[MsmtTup, None, None]: - for cn, can_f in enumerate(fetch_cans(s3, conf, filenames)): - try: - if cb_file_done: - cb_file_done() - # log.info("can %s ready", can_f.name) - for msmt_tup in load_multiple(can_f.as_posix()): - yield msmt_tup - except Exception as e: - log.error(str(e), exc_info=True) - - if not conf.keep_s3_cache: - try: - can_f.unlink() - except FileNotFoundError: - pass - +@metrics.timer("download_measurement_container") def download_measurement_container(s3, conf, file_entry: FileEntry): diskf = file_entry.output_path(conf.s3cachedir) if diskf.exists() and file_entry.size == diskf.stat().st_size: + metrics.incr("cache_hit") diskf.touch(exist_ok=True) return diskf + metrics.incr("cache_miss") + + file_entry.log_download() + def _cb(bytes_count): + if _cb.start_time is None: + _cb.start_time = time.time() + _cb.count = bytes_count + return + _cb.count += bytes_count + _cb.total_count += bytes_count + metrics.gauge("s3_download_percentage", _cb.total_count / _cb.total_size * 100) + try: + speed = _cb.count / 131_072 / (time.time() - _cb.start_time) + metrics.gauge("s3_download_speed_avg_Mbps", speed) + except ZeroDivisionError: + pass + + _cb.total_size = file_entry.size + _cb.total_count = 0 diskf.parent.mkdir(parents=True, exist_ok=True) tmpf = diskf.with_suffix(".s3tmp") @@ -428,22 +389,37 @@ def download_measurement_container(s3, conf, file_entry: FileEntry): metrics.gauge("fetching", 0) tmpf.rename(diskf) assert file_entry.size == diskf.stat().st_size + metrics.gauge("s3_download_speed_avg_Mbps", 0) return diskf +def stream_measurements(s3, conf, file_entries: Generator[FileEntry, None, None]) -> 
Generator[MsmtTup, None, None]: + for fe in file_entries: + if not fe.matches_filter(conf.ccs, conf.testnames): + continue + mc = download_measurement_container(s3, conf, fe) + try: + yield from load_multiple(mc.as_posix()) + except Exception as e: + log.error(str(e), exc_info=True) + if not conf.keep_s3_cache: + try: + mc.unlink() + except FileNotFoundError: + pass def stream_cans(conf, start_day: date, end_day: date) -> Generator[MsmtTup, None, None]: """Stream cans from S3""" log.info("Fetching older cans from S3") t0 = time.time() s3 = create_s3_client() - lambda cb_file_done: _update_eta(t0, start_day, day, stop_day, cn, len(cans_fns)) for day in date_interval(start_day, end_day): log.info("Processing day %s", day) - cans_fns = list_cans_on_s3_for_a_day(s3, day) - minicans_fns = list_minicans_on_s3_for_a_day(s3, day, conf.ccs, conf.testnames) - cans_fns.extend(minicans_fns) - for msmt_tup in stream_measurements_from_files(s3, conf, cans_fns, cb_file_done=cb_file_done): - yield msmt_tup + + can_file_entries = itertools.chain( + iter_cans_on_s3_for_a_day(s3, day), + iter_minicans_on_s3_for_a_day(s3, day) + ) + yield from stream_measurements(s3, conf, can_file_entries) if end_day: log.info(f"Reached {end_day}, streaming cans from S3 finished") @@ -453,10 +429,7 @@ def stream_jsonl(conf, start_day: date, end_day: date) -> Generator[MsmtTup, Non """Stream jsonl from S3""" log.info("Fetching older cans from S3") s3 = create_s3_client() - for file_entry in jsonl_in_range(s3, conf, start_day, end_day): - jsonl_fns = [(file_entry.s3path, file_entry.size)] - for msmt_tup in stream_measurements_from_files(s3, conf, jsonl_fns): - yield msmt_tup + yield from stream_measurements(s3, conf, jsonl_in_range(s3, conf, start_day, end_day)) if end_day: log.info(f"Reached {end_day}, streaming cans from S3 finished") From 8ece3f5590076fb751c34e7c8736cb31897a143b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Tue, 15 Feb 2022 19:33:36 +0100 Subject: [PATCH 
16/49] Reflow using black --- oonidata/oonidata/s3feeder.py | 76 ++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 19 deletions(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index e1d0a9b0..0b1ab867 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -29,7 +29,7 @@ from botocore.config import Config as botoConfig from .metrics import setup_metrics -from .mytypes import MsmtTup # msmt bytes, msmt dict, uid +from .mytypes import MsmtTup # msmt bytes, msmt dict, uid from .normalize import iter_yaml_msmt_normalized from .utils import trivial_id @@ -154,14 +154,17 @@ def create_s3_client(): def list_cans_on_s3_for_a_day(s3, day: date) -> list: - return list(map(lambda fe: (fe.s3path, fe.size), iter_cans_on_s3_for_a_day(s3, day))) + return list( + map(lambda fe: (fe.s3path, fe.size), iter_cans_on_s3_for_a_day(s3, day)) + ) + def iter_cans_on_s3_for_a_day(s3, day: date): """List legacy cans.""" prefix = f"canned/{day}/" paginator = s3.get_paginator("list_objects_v2") files = [] - for r in paginator.paginate(Bucket=CAN_BUCKET_NAME , Prefix=prefix): + for r in paginator.paginate(Bucket=CAN_BUCKET_NAME, Prefix=prefix): if ("Contents" in r) ^ (day <= date(2020, 10, 21)): # The last day with cans is 2020-10-21 log.warn("%d can files found!", len(r.get("Contents", []))) @@ -192,10 +195,11 @@ def iter_cans_on_s3_for_a_day(s3, day: date): size=f["Size"], ext=ext, s3path=s3path, - bucket_name=MC_BUCKET_NAME + bucket_name=MC_BUCKET_NAME, ) yield file_entry + class FileEntry(NamedTuple): timestamp: Any country_code: Any @@ -207,7 +211,13 @@ class FileEntry(NamedTuple): bucket_name: str def output_path(self, dst_dir: Path) -> Path: - return dst_dir / self.test_name / self.country_code / f"self.timestamp:%Y-%m-%d" / file_entry.filename + return ( + dst_dir + / self.test_name + / self.country_code + / f"self.timestamp:%Y-%m-%d" + / file_entry.filename + ) def matches_filter(self, ccs: Set[str], testnames: 
Set[str]) -> bool: if self.country_code and len(ccs) > 0 and self.country_code not in ccs: @@ -243,11 +253,14 @@ def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: s3path=s3path, size=f["Size"], ext=ext, - bucket_name=MC_BUCKET_NAME + bucket_name=MC_BUCKET_NAME, ) yield file_entry -def _legacy_jsonl_on_s3_for_a_day(s3, day: date, country_codes: Set[str], test_names: Set[str]) -> list: + +def _legacy_jsonl_on_s3_for_a_day( + s3, day: date, country_codes: Set[str], test_names: Set[str] +) -> list: tstamp = day.strftime("%Y%m%d") prefix = f"raw/{tstamp}/" files = [] @@ -262,7 +275,9 @@ def _legacy_jsonl_on_s3_for_a_day(s3, day: date, country_codes: Set[str], test_n yield file_entry -def jsonl_in_range(s3, conf, start_day: date, end_day: date) -> Generator[FileEntry, None, None]: +def jsonl_in_range( + s3, conf, start_day: date, end_day: date +) -> Generator[FileEntry, None, None]: # List all the jsonl file entries in the old bucket format for day in date_interval(max(date(2020, 10, 20), start_day), end_day): for fe in _legacy_jsonl_on_s3_for_a_day(s3, day, conf.ccs, conf.testnames): @@ -272,7 +287,9 @@ def jsonl_in_range(s3, conf, start_day: date, end_day: date) -> Generator[FileEn # We have both a testname list and a country code list, we can efficiently # pre-filter based on prefix if len(conf.testnames) > 0 and len(conf.ccs) > 0: - c = itertools.product(conf.testnames, conf.ccs, date_interval(start_day, date(2020, 10, 21))) + c = itertools.product( + conf.testnames, conf.ccs, date_interval(start_day, date(2020, 10, 21)) + ) prefixes = [f"jsonl/{tn}/{cc}/{ts}" for cc, tn, ts in c] elif len(conf.testnames): @@ -283,7 +300,9 @@ def jsonl_in_range(s3, conf, start_day: date, end_day: date) -> Generator[FileEn for p in prefixes: for file_entry in iter_file_entries(s3, p): if file_entry.ext != "jsonl.gz": - log.warn(f"Found non jsonl.gz file in jsonl prefix: {file_entry.s3path}") + log.warn( + f"Found non jsonl.gz file in jsonl prefix: 
{file_entry.s3path}" + ) continue if not file_entry.matches_filter(conf.ccs, conf.testnames): @@ -295,13 +314,21 @@ def jsonl_in_range(s3, conf, start_day: date, end_day: date) -> Generator[FileEn if file_entry.size > 0: yield file_entry -def list_minicans_on_s3_for_a_day(s3, day: date, ccs: Set[str], testnames: Set[str]) -> list: + +def list_minicans_on_s3_for_a_day( + s3, day: date, ccs: Set[str], testnames: Set[str] +) -> list: return list( - map(lambda fe: (fe.s3path, fe.size)), - filter(lambda fe: fe.matches_filter(ccs, testnames), - iter_minicans_on_s3_for_a_day(s3, day)) + map( + lambda fe: (fe.s3path, fe.size), + filter( + lambda fe: fe.matches_filter(ccs, testnames), + iter_minicans_on_s3_for_a_day(s3, day), + ), + ) ) + def iter_minicans_on_s3_for_a_day(s3, day: date) -> Generator[FileEntry, None, None]: """List minicans. Filter them by CCs and testnames Testnames are without underscores. @@ -342,6 +369,7 @@ def _update_eta(t0, start_day, day, stop_day, can_num, can_tot_count): except: pass + def date_interval(start_day: date, end_day: date): today = date.today() if not start_day or start_day >= today: @@ -353,6 +381,7 @@ def date_interval(start_day: date, end_day: date): yield day day += timedelta(days=1) + @metrics.timer("download_measurement_container") def download_measurement_container(s3, conf, file_entry: FileEntry): diskf = file_entry.output_path(conf.s3cachedir) @@ -363,6 +392,7 @@ def download_measurement_container(s3, conf, file_entry: FileEntry): metrics.incr("cache_miss") file_entry.log_download() + def _cb(bytes_count): if _cb.start_time is None: _cb.start_time = time.time() @@ -392,7 +422,10 @@ def _cb(bytes_count): metrics.gauge("s3_download_speed_avg_Mbps", 0) return diskf -def stream_measurements(s3, conf, file_entries: Generator[FileEntry, None, None]) -> Generator[MsmtTup, None, None]: + +def stream_measurements( + s3, conf, file_entries: Generator[FileEntry, None, None] +) -> Generator[MsmtTup, None, None]: for fe in file_entries: if 
not fe.matches_filter(conf.ccs, conf.testnames): continue @@ -407,6 +440,7 @@ def stream_measurements(s3, conf, file_entries: Generator[FileEntry, None, None] except FileNotFoundError: pass + def stream_cans(conf, start_day: date, end_day: date) -> Generator[MsmtTup, None, None]: """Stream cans from S3""" log.info("Fetching older cans from S3") @@ -416,8 +450,7 @@ def stream_cans(conf, start_day: date, end_day: date) -> Generator[MsmtTup, None log.info("Processing day %s", day) can_file_entries = itertools.chain( - iter_cans_on_s3_for_a_day(s3, day), - iter_minicans_on_s3_for_a_day(s3, day) + iter_cans_on_s3_for_a_day(s3, day), iter_minicans_on_s3_for_a_day(s3, day) ) yield from stream_measurements(s3, conf, can_file_entries) @@ -425,11 +458,16 @@ def stream_cans(conf, start_day: date, end_day: date) -> Generator[MsmtTup, None log.info(f"Reached {end_day}, streaming cans from S3 finished") return -def stream_jsonl(conf, start_day: date, end_day: date) -> Generator[MsmtTup, None, None]: + +def stream_jsonl( + conf, start_day: date, end_day: date +) -> Generator[MsmtTup, None, None]: """Stream jsonl from S3""" log.info("Fetching older cans from S3") s3 = create_s3_client() - yield from stream_measurements(s3, conf, jsonl_in_range(s3, conf, start_day, end_day)) + yield from stream_measurements( + s3, conf, jsonl_in_range(s3, conf, start_day, end_day) + ) if end_day: log.info(f"Reached {end_day}, streaming cans from S3 finished") From f779ad6e608dd06f7f22fa4c2f5798a785fd31fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Tue, 15 Feb 2022 19:54:02 +0100 Subject: [PATCH 17/49] Adjust oonidata CLI based on changes in s3feeder --- oonidata/oonidata/main.py | 50 +++++++++++++++-------------------- oonidata/oonidata/s3feeder.py | 9 ++++--- 2 files changed, 27 insertions(+), 32 deletions(-) diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index 2b6a345f..862cff2a 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py 
@@ -15,11 +15,10 @@ import ujson -from .s3feeder import create_s3_client -from .s3feeder import jsonl_in_range, stream_measurements_from_files +from .s3feeder import create_s3_client, FileEntry, download_measurement_container +from .s3feeder import jsonl_in_range Config = namedtuple("Config", ["ccs", "testnames", "keep_s3_cache", "s3cachedir"]) -FileEntry = namedtuple("FileEntry", ["country", "test_name", "date", "basename"]) log = logging.getLogger("oonidata") logging.basicConfig(level=logging.INFO) @@ -48,46 +47,41 @@ def _(json_list: list, max_string_size: int): trim_measurement(item, max_string_size) return json_list +def trim_container(conf, fe: FileEntry, max_string_size: int): + mc = fe.output_path(conf.s3cachedir) + temp_path = diskf.with_suffix(".tmp") + try: + with gzip.open(temp_path, mode="wt", encoding="utf-8", newline="\n") as out_file: + for msmt in load_multiple(mc.as_posix()): + msmt = trim_measurement(msmt, args.max_string_size) + ujson.dump(msmt, out_file) + out_file.write("\n") + temp_path.replace(mc) + except: + temp_path.unlink() + raise + def sync(args): testnames = [] ccs = [] - if args.test_name: testnames = list(map(lambda x: x.replace("_", ""), args.test_name)) if args.country: ccs = args.country - - s3cachedir = tempfile.TemporaryDirectory() conf = Config( ccs=ccs, testnames=testnames, - keep_s3_cache=False, - s3cachedir=pathlib.Path(s3cachedir.name) + keep_s3_cache=True, + s3cachedir=args.output_dir ) t0 = time.time() s3 = create_s3_client() for file_entry in jsonl_in_range(s3, conf, args.since, args.until): - dst_path = args.output_dir / file_entry.test_name / file_entry.country_code / f"{file_entry.timestamp:%Y-%m-%d}" / file_entry.filename - if dst_path.is_file(): + if not file_entry.matches_filter(ccs, testnames): continue - os.makedirs(dst_path.parent, exist_ok=True) - temp_path = dst_path.with_name(f"{dst_path.name}.tmp") - try: - with gzip.open(temp_path, mode="wt", encoding="utf-8", newline="\n") as out_file: - jsonl_fns = 
[(file_entry.fullpath, file_entry.size)] - for msmt_tup in stream_measurements_from_files(s3, conf, jsonl_fns): - msmt = msmt_tup[1] - if args.max_string_size: - msmt = trim_measurement(msmt, args.max_string_size) - ujson.dump(msmt, out_file) - out_file.write("\n") - temp_path.replace(dst_path) - except: - temp_path.unlink() - s3cachedir.cleanup() - raise - - s3cachedir.cleanup() + mc = download_measurement_container(s3, conf, file_entry) + if args.max_string_size: + trim_container(conf, fe, args.max_string_size) def _parse_date_flag(date_str: str) -> dt.date: return dt.datetime.strptime(date_str, "%Y-%m-%d").date() diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 0b1ab867..7774771e 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -215,15 +215,15 @@ def output_path(self, dst_dir: Path) -> Path: dst_dir / self.test_name / self.country_code - / f"self.timestamp:%Y-%m-%d" - / file_entry.filename + / f"{self.timestamp:%Y-%m-%d}" + / self.filename ) def matches_filter(self, ccs: Set[str], testnames: Set[str]) -> bool: - if self.country_code and len(ccs) > 0 and self.country_code not in ccs: + if self.country_code and ccs and self.country_code not in ccs: return False - if self.test_name and len(testnames) > 0 and self.test_name not in testnames: + if self.test_name and testnames and self.test_name not in testnames: return False return True @@ -409,6 +409,7 @@ def _cb(bytes_count): _cb.total_size = file_entry.size _cb.total_count = 0 + _cb.start_time = None diskf.parent.mkdir(parents=True, exist_ok=True) tmpf = diskf.with_suffix(".s3tmp") From 64fdcdc97e93c122fdd64e5987eb3af437145da3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Tue, 15 Feb 2022 20:01:27 +0100 Subject: [PATCH 18/49] Small cosmetic improvements to the CLI --- oonidata/oonidata/main.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/oonidata/oonidata/main.py 
b/oonidata/oonidata/main.py index 862cff2a..56562025 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -63,13 +63,12 @@ def trim_container(conf, fe: FileEntry, max_string_size: int): def sync(args): testnames = [] - ccs = [] - if args.test_name: - testnames = list(map(lambda x: x.replace("_", ""), args.test_name)) - if args.country: - ccs = args.country + if args.test_names: + # Replace _ with a - + testnames = list(map(lambda x: x.replace("_", ""), args.test_names)) + conf = Config( - ccs=ccs, + ccs=args.country_codes, testnames=testnames, keep_s3_cache=True, s3cachedir=args.output_dir @@ -93,12 +92,12 @@ def main(): subparsers = parser.add_subparsers() parser_sync = subparsers.add_parser("sync", help="Sync OONI measurements") - parser_sync.add_argument("--country", type=str, nargs="*") + parser_sync.add_argument("--country-codes", type=str, nargs="*", help="List of probe_cc values to filter by") parser_sync.add_argument("--since", type=_parse_date_flag, default=dt.date.today() - dt.timedelta(days=14)) parser_sync.add_argument("--until", type=_parse_date_flag, default=dt.date.today()) - parser_sync.add_argument("--test-name", nargs="*") + parser_sync.add_argument("--test-names", nargs="*", help="List of test_name values to filter by") parser_sync.add_argument("--max-string-size", type=int) parser_sync.add_argument("--output-dir", type=pathlib.Path, required=True) parser_sync.add_argument("--debug", action="store_true") From b450860641d9765587e1cff7ac693dd6fedcf65f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Tue, 15 Feb 2022 20:02:02 +0100 Subject: [PATCH 19/49] Reflow with black --- oonidata/oonidata/main.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index 56562025..74c40f35 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -29,6 +29,7 @@ def trim_measurement(json_obj, 
max_string_size: int): return json_obj + @trim_measurement.register(dict) def _(json_dict: dict, max_string_size: int): keys_to_delete: List[str] = [] @@ -41,17 +42,21 @@ def _(json_dict: dict, max_string_size: int): del json_dict[key] return json_dict + @trim_measurement.register(list) def _(json_list: list, max_string_size: int): for item in json_list: trim_measurement(item, max_string_size) return json_list + def trim_container(conf, fe: FileEntry, max_string_size: int): mc = fe.output_path(conf.s3cachedir) temp_path = diskf.with_suffix(".tmp") try: - with gzip.open(temp_path, mode="wt", encoding="utf-8", newline="\n") as out_file: + with gzip.open( + temp_path, mode="wt", encoding="utf-8", newline="\n" + ) as out_file: for msmt in load_multiple(mc.as_posix()): msmt = trim_measurement(msmt, args.max_string_size) ujson.dump(msmt, out_file) @@ -61,6 +66,7 @@ def trim_container(conf, fe: FileEntry, max_string_size: int): temp_path.unlink() raise + def sync(args): testnames = [] if args.test_names: @@ -71,7 +77,7 @@ def sync(args): ccs=args.country_codes, testnames=testnames, keep_s3_cache=True, - s3cachedir=args.output_dir + s3cachedir=args.output_dir, ) t0 = time.time() s3 = create_s3_client() @@ -82,9 +88,11 @@ def sync(args): if args.max_string_size: trim_container(conf, fe, args.max_string_size) + def _parse_date_flag(date_str: str) -> dt.date: return dt.datetime.strptime(date_str, "%Y-%m-%d").date() + def main(): parser = argparse.ArgumentParser("OONI Data tools") parser.set_defaults(func=lambda r: parser.print_usage()) @@ -92,12 +100,21 @@ def main(): subparsers = parser.add_subparsers() parser_sync = subparsers.add_parser("sync", help="Sync OONI measurements") - parser_sync.add_argument("--country-codes", type=str, nargs="*", help="List of probe_cc values to filter by") - parser_sync.add_argument("--since", type=_parse_date_flag, - default=dt.date.today() - dt.timedelta(days=14)) - parser_sync.add_argument("--until", type=_parse_date_flag, - 
default=dt.date.today()) - parser_sync.add_argument("--test-names", nargs="*", help="List of test_name values to filter by") + parser_sync.add_argument( + "--country-codes", + type=str, + nargs="*", + help="List of probe_cc values to filter by", + ) + parser_sync.add_argument( + "--since", + type=_parse_date_flag, + default=dt.date.today() - dt.timedelta(days=14), + ) + parser_sync.add_argument("--until", type=_parse_date_flag, default=dt.date.today()) + parser_sync.add_argument( + "--test-names", nargs="*", help="List of test_name values to filter by" + ) parser_sync.add_argument("--max-string-size", type=int) parser_sync.add_argument("--output-dir", type=pathlib.Path, required=True) parser_sync.add_argument("--debug", action="store_true") @@ -106,5 +123,6 @@ def main(): args = parser.parse_args() sys.exit(args.func(args)) + if __name__ == "__main__": main() From a271bed7651b475432ef456ad86fc453e9c14ecf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Tue, 15 Feb 2022 20:03:07 +0100 Subject: [PATCH 20/49] Fix typo --- oonidata/oonidata/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index 74c40f35..abbcd8c6 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -82,7 +82,7 @@ def sync(args): t0 = time.time() s3 = create_s3_client() for file_entry in jsonl_in_range(s3, conf, args.since, args.until): - if not file_entry.matches_filter(ccs, testnames): + if not file_entry.matches_filter(conf.ccs, conf.testnames): continue mc = download_measurement_container(s3, conf, file_entry) if args.max_string_size: From 589f9c88be6d5e6054f7c62b92b9a2478ca24a72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 16 Feb 2022 11:27:17 +0100 Subject: [PATCH 21/49] Simplify jsonl listing --- oonidata/oonidata/s3feeder.py | 38 +++++++++-------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git 
a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 7774771e..2ca236ab 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -258,46 +258,26 @@ def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: yield file_entry -def _legacy_jsonl_on_s3_for_a_day( - s3, day: date, country_codes: Set[str], test_names: Set[str] -) -> list: - tstamp = day.strftime("%Y%m%d") - prefix = f"raw/{tstamp}/" - files = [] - for file_entry in iter_file_entries(s3, prefix): - if file_entry.ext != "jsonl.gz": - continue - - if not file_entry.matches_filter(country_codes, test_names): - continue - - if file_entry.size > 0: - yield file_entry - - def jsonl_in_range( s3, conf, start_day: date, end_day: date ) -> Generator[FileEntry, None, None]: - # List all the jsonl file entries in the old bucket format - for day in date_interval(max(date(2020, 10, 20), start_day), end_day): - for fe in _legacy_jsonl_on_s3_for_a_day(s3, day, conf.ccs, conf.testnames): - yield fe - + legacy_prefixes = [ + f"raw/{d:%Y%m%d}" + for d in date_interval(max(date(2020, 10, 20), start_day), end_day) + ] prefixes = ["jsonl/"] # We have both a testname list and a country code list, we can efficiently # pre-filter based on prefix - if len(conf.testnames) > 0 and len(conf.ccs) > 0: - c = itertools.product( - conf.testnames, conf.ccs, date_interval(start_day, date(2020, 10, 21)) - ) - prefixes = [f"jsonl/{tn}/{cc}/{ts}" for cc, tn, ts in c] + if conf.testnames and conf.ccs: + c = itertools.product(conf.testnames, conf.ccs) + prefixes = [f"jsonl/{tn}/{cc}/" for cc, tn in c] - elif len(conf.testnames): + elif conf.testnames: prefixes = [f"jsonl/{tn}/" for tn in conf.testnames] # In other cases, we are going to have to list all the bucket and do # filtering based on filepath - for p in prefixes: + for p in prefixes + legacy_prefixes: for file_entry in iter_file_entries(s3, p): if file_entry.ext != "jsonl.gz": log.warn( From 
9e78b6a99449668ea11ab29493c10fe07b5c387f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 16 Feb 2022 11:48:47 +0100 Subject: [PATCH 22/49] Use day instead of timestamp --- oonidata/oonidata/s3feeder.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 2ca236ab..39b180e8 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -188,7 +188,7 @@ def iter_cans_on_s3_for_a_day(s3, day: date): continue file_entry = FileEntry( - timestamp=day, + day=day, country_code=country_code, test_name=test_name, filename=filename, @@ -201,7 +201,7 @@ def iter_cans_on_s3_for_a_day(s3, day: date): class FileEntry(NamedTuple): - timestamp: Any + day: date country_code: Any test_name: str filename: str @@ -215,7 +215,7 @@ def output_path(self, dst_dir: Path) -> Path: dst_dir / self.test_name / self.country_code - / f"{self.timestamp:%Y-%m-%d}" + / f"{self.day:%Y-%m-%d}" / self.filename ) @@ -246,7 +246,7 @@ def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: parts = filename.split("_") test_name, _, _, ext = parts[2].split(".", 3) file_entry = FileEntry( - timestamp=datetime.strptime(parts[0], "%Y%m%d%H"), + day=datetime.strptime(parts[0], "%Y%m%d%H").day(), country_code=parts[1], test_name=test_name, filename=filename, @@ -288,7 +288,7 @@ def jsonl_in_range( if not file_entry.matches_filter(conf.ccs, conf.testnames): continue - if file_entry.timestamp < start_day or file_entry.timestamp >= end_day: + if file_entry.day < start_day or file_entry.day >= end_day: continue if file_entry.size > 0: From 849610f374a51755ff00bffdf155fd414fc205f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 16 Feb 2022 11:51:42 +0100 Subject: [PATCH 23/49] Fix typo --- oonidata/oonidata/s3feeder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oonidata/oonidata/s3feeder.py 
b/oonidata/oonidata/s3feeder.py index 39b180e8..6aca80ac 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -246,7 +246,7 @@ def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: parts = filename.split("_") test_name, _, _, ext = parts[2].split(".", 3) file_entry = FileEntry( - day=datetime.strptime(parts[0], "%Y%m%d%H").day(), + day=datetime.strptime(parts[0], "%Y%m%d%H").date(), country_code=parts[1], test_name=test_name, filename=filename, From bbf8e8277de897429b71ed44ea9fc46d6a60cc16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 16 Feb 2022 11:58:45 +0100 Subject: [PATCH 24/49] Fix parsing in s3feeder --- oonidata/oonidata/s3feeder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 6aca80ac..542c677f 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -246,7 +246,7 @@ def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: parts = filename.split("_") test_name, _, _, ext = parts[2].split(".", 3) file_entry = FileEntry( - day=datetime.strptime(parts[0], "%Y%m%d%H").date(), + day=datetime.strptime(parts[0], "%Y%m%d").date(), country_code=parts[1], test_name=test_name, filename=filename, From f9c2cf25f9ad952669a862d8ab36fdda39a2f70d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 16 Feb 2022 12:08:35 +0100 Subject: [PATCH 25/49] Bugfix related to inconsistent filename in legacy jsonl vs new jsonl filenames --- oonidata/oonidata/s3feeder.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 542c677f..ea113227 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -246,7 +246,9 @@ def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: parts = filename.split("_") test_name, _, _, ext = 
parts[2].split(".", 3) file_entry = FileEntry( - day=datetime.strptime(parts[0], "%Y%m%d").date(), + # We need to truncate the first 8 chars, because of + # inconsitencies between the old and new filenames + day=datetime.strptime(parts[0][:8], "%Y%m%d").date(), country_code=parts[1], test_name=test_name, filename=filename, From 6138e11c8bd9e0d951336476e2411d0fd34da28c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 16 Feb 2022 12:11:18 +0100 Subject: [PATCH 26/49] Fix log line --- oonidata/oonidata/s3feeder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index ea113227..7ee3f17e 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -446,7 +446,7 @@ def stream_jsonl( conf, start_day: date, end_day: date ) -> Generator[MsmtTup, None, None]: """Stream jsonl from S3""" - log.info("Fetching older cans from S3") + log.info("Fetching jsonl from S3") s3 = create_s3_client() yield from stream_measurements( s3, conf, jsonl_in_range(s3, conf, start_day, end_day) From 0af8a04b1a63b6ecbd8fabc3725fae126f8c5fe1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 16 Feb 2022 12:40:09 +0100 Subject: [PATCH 27/49] Include in listing yaml.lz4 files --- oonidata/oonidata/s3feeder.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 7ee3f17e..8054e225 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -177,11 +177,11 @@ def iter_cans_on_s3_for_a_day(s3, day: date): if filename.endswith(".tar.lz4"): test_name = filename.split(".")[0].replace("_", "") ext = "tar.lz4" - elif filename.endswith(".json.lz4"): + elif filename.endswith(".json.lz4") or filename.endswith(".yaml.lz4"): parts = filename.split("-") country_code = parts[1] test_name = parts[3].replace("_", "") - ext = "json.lz4" + ext = 
".".join(filename.split(".")[-2:]) else: if filename != "index.json.gz": log.warn(f"found an unexpected filename {filename}") @@ -318,16 +318,11 @@ def iter_minicans_on_s3_for_a_day(s3, day: date) -> Generator[FileEntry, None, N # s3cmd ls s3://ooni-data-eu-fra/raw/20210202 tstamp = day.strftime("%Y%m%d") prefix = f"raw/{tstamp}/" - files = [] for file_entry in iter_file_entries(s3, prefix): if not file_entry.ext != "tar.gz": continue yield file_entry - if (day >= date(2020, 10, 20)) ^ len(files) > 0: - # The first day with minicans is 2020-10-20 - log.warn("%d minican files found!", len(files)) - def _calculate_etr(t0, now, start_day, day, stop_day, can_num, can_tot_count) -> int: """Estimate total runtime in seconds. From f76d8fe7982096c47ced374db35b7c08309bffff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 16 Feb 2022 12:45:52 +0100 Subject: [PATCH 28/49] Bugfixing of listing for legacy cans --- oonidata/oonidata/s3feeder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 8054e225..e328c7f4 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -195,7 +195,7 @@ def iter_cans_on_s3_for_a_day(s3, day: date): size=f["Size"], ext=ext, s3path=s3path, - bucket_name=MC_BUCKET_NAME, + bucket_name=CAN_BUCKET_NAME, ) yield file_entry From 25d9bd73eef0153a0db6c653bd49867db19731f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 16 Feb 2022 13:00:13 +0100 Subject: [PATCH 29/49] Use XX as unknow country code as key for cans --- oonidata/oonidata/s3feeder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index e328c7f4..d60ac07f 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -176,6 +176,7 @@ def iter_cans_on_s3_for_a_day(s3, day: date): ext = None if filename.endswith(".tar.lz4"): test_name = 
filename.split(".")[0].replace("_", "") + country_code = "XX" ext = "tar.lz4" elif filename.endswith(".json.lz4") or filename.endswith(".yaml.lz4"): parts = filename.split("-") From 8f2afc2e84138125d74a0494615eee9be6e54234 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 16 Feb 2022 13:45:39 +0100 Subject: [PATCH 30/49] Don't display warning for non jsonl --- oonidata/oonidata/s3feeder.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index d60ac07f..29ff962a 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -203,7 +203,7 @@ def iter_cans_on_s3_for_a_day(s3, day: date): class FileEntry(NamedTuple): day: date - country_code: Any + country_code: str test_name: str filename: str size: int @@ -283,9 +283,6 @@ def jsonl_in_range( for p in prefixes + legacy_prefixes: for file_entry in iter_file_entries(s3, p): if file_entry.ext != "jsonl.gz": - log.warn( - f"Found non jsonl.gz file in jsonl prefix: {file_entry.s3path}" - ) continue if not file_entry.matches_filter(conf.ccs, conf.testnames): From cea7bb0702f6cd6ed4fed44de1442bc95e2bba3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Thu, 17 Feb 2022 11:53:35 +0100 Subject: [PATCH 31/49] Boost performance of the jsonl_in_range function * Write some integration tests for the s3feeder --- oonidata/oonidata/main.py | 2 +- oonidata/oonidata/s3feeder.py | 67 +++++++++++++++++------- oonidata/oonidata/tests/test_s3feeder.py | 40 ++++++++++++++ 3 files changed, 90 insertions(+), 19 deletions(-) create mode 100644 oonidata/oonidata/tests/test_s3feeder.py diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index abbcd8c6..e51cbd0a 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -81,7 +81,7 @@ def sync(args): ) t0 = time.time() s3 = create_s3_client() - for file_entry in jsonl_in_range(s3, conf, args.since, args.until): + for 
file_entry in jsonl_in_range(s3, conf.ccs, conf.testnames, args.since, args.until): if not file_entry.matches_filter(conf.ccs, conf.testnames): continue mc = download_measurement_container(s3, conf, file_entry) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 29ff962a..7e87cee4 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -10,9 +10,10 @@ """ from datetime import date, timedelta, datetime -from typing import Generator, Set, NamedTuple, Any +from typing import Generator, Set, NamedTuple, Any, List from collections import namedtuple from pathlib import Path +import itertools import logging import os import time @@ -261,34 +262,56 @@ def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: yield file_entry +def list_all_testnames(s3) -> Set[str]: + testnames = set() + paginator = s3.get_paginator("list_objects_v2") + for r in paginator.paginate(Bucket=MC_BUCKET_NAME, Prefix="jsonl/", Delimiter="/"): + for f in r.get("CommonPrefixes", []): + testnames.add(f["Prefix"].split("/")[-2]) + return testnames + +def get_search_prefixes(s3, testnames: Set[str], ccs: Set[str]) -> List[str]: + """ + get_search_prefixes will return all the prefixes inside of the new jsonl + bucket that match the given testnames and ccs. + If the ccs list is empty we will return prefixes for all countries for + which that particular testname as measurements. 
+ """ + prefixes = [] + paginator = s3.get_paginator("list_objects_v2") + for tn in testnames: + for r in paginator.paginate(Bucket=MC_BUCKET_NAME, Prefix=f"jsonl/{tn}/", Delimiter="/"): + for f in r.get("CommonPrefixes", []): + prefix = f["Prefix"] + cc = prefix.split("/")[-2] + if ccs and cc not in ccs: + continue + prefixes.append(prefix) + return prefixes + def jsonl_in_range( - s3, conf, start_day: date, end_day: date + s3, ccs: Set[str], testnames: Set[str], start_day: date, end_day: date ) -> Generator[FileEntry, None, None]: legacy_prefixes = [ f"raw/{d:%Y%m%d}" for d in date_interval(max(date(2020, 10, 20), start_day), end_day) ] - prefixes = ["jsonl/"] - # We have both a testname list and a country code list, we can efficiently - # pre-filter based on prefix - if conf.testnames and conf.ccs: - c = itertools.product(conf.testnames, conf.ccs) - prefixes = [f"jsonl/{tn}/{cc}/" for cc, tn in c] - - elif conf.testnames: - prefixes = [f"jsonl/{tn}/" for tn in conf.testnames] - - # In other cases, we are going to have to list all the bucket and do - # filtering based on filepath + if not testnames: + testnames = list_all_testnames(s3) + search_prefixes = get_search_prefixes(s3, testnames, ccs) + + c = itertools.product(search_prefixes, date_interval(start_day, end_day)) + prefixes = [f"{p}{d:%Y%m%d}" for p, d in c] + for p in prefixes + legacy_prefixes: for file_entry in iter_file_entries(s3, p): if file_entry.ext != "jsonl.gz": continue - if not file_entry.matches_filter(conf.ccs, conf.testnames): + if not file_entry.matches_filter(ccs, testnames): continue - if file_entry.day < start_day or file_entry.day >= end_day: + if not (file_entry.day < start_day or file_entry.day >= end_day): continue if file_entry.size > 0: @@ -359,6 +382,8 @@ def date_interval(start_day: date, end_day: date): @metrics.timer("download_measurement_container") def download_measurement_container(s3, conf, file_entry: FileEntry): + s3_config = TransferConfig(max_concurrency=10, 
use_threads=True) + diskf = file_entry.output_path(conf.s3cachedir) if diskf.exists() and file_entry.size == diskf.stat().st_size: metrics.incr("cache_hit") @@ -389,7 +414,13 @@ def _cb(bytes_count): diskf.parent.mkdir(parents=True, exist_ok=True) tmpf = diskf.with_suffix(".s3tmp") with tmpf.open("wb") as f: - s3.download_fileobj(file_entry.bucket_name, file_entry.s3path, f, Callback=_cb) + s3.download_fileobj( + file_entry.bucket_name, + file_entry.s3path, + f, + Config=s3_config, + Callback=_cb + ) f.flush() os.fsync(f.fileno()) metrics.gauge("fetching", 0) @@ -442,7 +473,7 @@ def stream_jsonl( log.info("Fetching jsonl from S3") s3 = create_s3_client() yield from stream_measurements( - s3, conf, jsonl_in_range(s3, conf, start_day, end_day) + s3, conf, jsonl_in_range(s3, conf.ccs, conf.testnames, start_day, end_day) ) if end_day: diff --git a/oonidata/oonidata/tests/test_s3feeder.py b/oonidata/oonidata/tests/test_s3feeder.py new file mode 100644 index 00000000..5bc30a68 --- /dev/null +++ b/oonidata/oonidata/tests/test_s3feeder.py @@ -0,0 +1,40 @@ +import pytest +import time + +from datetime import date +from oonidata.s3feeder import iter_file_entries, create_s3_client +from oonidata.s3feeder import iter_cans_on_s3_for_a_day, jsonl_in_range + +@pytest.fixture +def s3(): + return create_s3_client() + +def test_iter_file_entries_new_jsonl(s3): + fe_list = list(iter_file_entries(s3, "jsonl/webconnectivity/IT/20201020/00/")) + assert len(fe_list) == 19 + for fe in fe_list: + assert fe.test_name == "webconnectivity" + assert fe.country_code == "IT" + assert fe.size > 0 + assert fe.bucket_name == "ooni-data-eu-fra" + assert fe.day == date(2020, 10, 20) + assert fe.ext == "jsonl.gz" + +def test_iter_file_entries_old_format(s3): + fe_list = list(iter_file_entries(s3, "raw/20211020/00/IT/webconnectivity/")) + assert len(fe_list) == 6 + for fe in fe_list: + assert fe.test_name == "webconnectivity" + assert fe.country_code == "IT" + assert fe.size > 0 + assert fe.bucket_name 
== "ooni-data-eu-fra" + assert fe.day == date(2021, 10, 20) + +def test_iter_cans_on_s3_for_a_day(s3): + fe_list = list(iter_cans_on_s3_for_a_day(s3, date(2020, 1, 1))) + assert len(fe_list) == 136 + assert all(map(lambda fe: fe.bucket_name == "ooni-data", fe_list)) + +def test_jsonl_in_range(s3): + fe_list = list(jsonl_in_range(s3, [], [], date(2020, 1, 1), date(2020, 1, 2))) + assert len(fe_list) == 1125 From bee5679672f8fbc88259481f3b4d7b54fd3d72fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Thu, 17 Feb 2022 13:22:46 +0100 Subject: [PATCH 32/49] Drop TransferConfig --- oonidata/oonidata/s3feeder.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 7e87cee4..fcd0dd40 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -382,8 +382,6 @@ def date_interval(start_day: date, end_day: date): @metrics.timer("download_measurement_container") def download_measurement_container(s3, conf, file_entry: FileEntry): - s3_config = TransferConfig(max_concurrency=10, use_threads=True) - diskf = file_entry.output_path(conf.s3cachedir) if diskf.exists() and file_entry.size == diskf.stat().st_size: metrics.incr("cache_hit") @@ -418,7 +416,6 @@ def _cb(bytes_count): file_entry.bucket_name, file_entry.s3path, f, - Config=s3_config, Callback=_cb ) f.flush() From b92ea29c935da41cbf3f8abae3af8c68ee101e7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Thu, 17 Feb 2022 18:04:22 +0100 Subject: [PATCH 33/49] Don't perform listing optimisations for ranges larger than 20 days --- oonidata/oonidata/s3feeder.py | 39 +++++++++++++++--------- oonidata/oonidata/tests/test_s3feeder.py | 8 +++-- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index fcd0dd40..59a4f03b 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -270,6 +270,7 @@ def 
list_all_testnames(s3) -> Set[str]: testnames.add(f["Prefix"].split("/")[-2]) return testnames + def get_search_prefixes(s3, testnames: Set[str], ccs: Set[str]) -> List[str]: """ get_search_prefixes will return all the prefixes inside of the new jsonl @@ -280,7 +281,9 @@ def get_search_prefixes(s3, testnames: Set[str], ccs: Set[str]) -> List[str]: prefixes = [] paginator = s3.get_paginator("list_objects_v2") for tn in testnames: - for r in paginator.paginate(Bucket=MC_BUCKET_NAME, Prefix=f"jsonl/{tn}/", Delimiter="/"): + for r in paginator.paginate( + Bucket=MC_BUCKET_NAME, Prefix=f"jsonl/{tn}/", Delimiter="/" + ): for f in r.get("CommonPrefixes", []): prefix = f["Prefix"] cc = prefix.split("/")[-2] @@ -289,21 +292,32 @@ def get_search_prefixes(s3, testnames: Set[str], ccs: Set[str]) -> List[str]: prefixes.append(prefix) return prefixes -def jsonl_in_range( - s3, ccs: Set[str], testnames: Set[str], start_day: date, end_day: date -) -> Generator[FileEntry, None, None]: + +def get_jsonl_prefixes( + s3, ccs: Set[str], testnames: Set[str], start_day: date, end_day: date +) -> List[str]: legacy_prefixes = [ f"raw/{d:%Y%m%d}" for d in date_interval(max(date(2020, 10, 20), start_day), end_day) ] if not testnames: testnames = list_all_testnames(s3) - search_prefixes = get_search_prefixes(s3, testnames, ccs) + prefixes = get_search_prefixes(s3, testnames, ccs) + + # This results in a faster listing in cases where we need only a small time + # windows. For larger windows of time, we are better off just listing + # everything. 
+ if (end_day - start_day).days < 20: + c = itertools.product(prefixes, date_interval(start_day, end_day)) + prefixes = [f"{p}{d:%Y%m%d}" for p, d in c] + return prefixes + legacy_prefixes - c = itertools.product(search_prefixes, date_interval(start_day, end_day)) - prefixes = [f"{p}{d:%Y%m%d}" for p, d in c] - for p in prefixes + legacy_prefixes: +def jsonl_in_range( + s3, ccs: Set[str], testnames: Set[str], start_day: date, end_day: date +) -> Generator[FileEntry, None, None]: + + for p in get_jsonl_prefixes(s3, ccs, testnames, start_day, end_day): for file_entry in iter_file_entries(s3, p): if file_entry.ext != "jsonl.gz": continue @@ -311,7 +325,7 @@ def jsonl_in_range( if not file_entry.matches_filter(ccs, testnames): continue - if not (file_entry.day < start_day or file_entry.day >= end_day): + if file_entry.day < start_day or file_entry.day >= end_day: continue if file_entry.size > 0: @@ -412,12 +426,7 @@ def _cb(bytes_count): diskf.parent.mkdir(parents=True, exist_ok=True) tmpf = diskf.with_suffix(".s3tmp") with tmpf.open("wb") as f: - s3.download_fileobj( - file_entry.bucket_name, - file_entry.s3path, - f, - Callback=_cb - ) + s3.download_fileobj(file_entry.bucket_name, file_entry.s3path, f, Callback=_cb) f.flush() os.fsync(f.fileno()) metrics.gauge("fetching", 0) diff --git a/oonidata/oonidata/tests/test_s3feeder.py b/oonidata/oonidata/tests/test_s3feeder.py index 5bc30a68..64da3a2d 100644 --- a/oonidata/oonidata/tests/test_s3feeder.py +++ b/oonidata/oonidata/tests/test_s3feeder.py @@ -2,7 +2,7 @@ import time from datetime import date -from oonidata.s3feeder import iter_file_entries, create_s3_client +from oonidata.s3feeder import iter_file_entries, create_s3_client, get_jsonl_prefixes from oonidata.s3feeder import iter_cans_on_s3_for_a_day, jsonl_in_range @pytest.fixture @@ -33,7 +33,11 @@ def test_iter_file_entries_old_format(s3): def test_iter_cans_on_s3_for_a_day(s3): fe_list = list(iter_cans_on_s3_for_a_day(s3, date(2020, 1, 1))) assert len(fe_list) 
== 136 - assert all(map(lambda fe: fe.bucket_name == "ooni-data", fe_list)) + assert all(map(lambda fe: fe.bucket_name == "ooni-data", fe_list)) + +def test_get_jsonl_prefixes(s3): + prefixes = list(get_jsonl_prefixes(s3, [], [], date(2020, 1, 1), date(2020, 1, 2))) + assert len(prefixes) == 2516 def test_jsonl_in_range(s3): fe_list = list(jsonl_in_range(s3, [], [], date(2020, 1, 1), date(2020, 1, 2))) From b6be94784fda2f8ca87be9616e261f04740e99d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Thu, 17 Feb 2022 19:55:46 +0100 Subject: [PATCH 34/49] Add support for parallel listing and download of data * Using parallelisation and sharing of the s3 client we get a 10x performance boost * Add progress bar to oonidata sync command via tqdm --- oonidata/oonidata/main.py | 32 +++---- oonidata/oonidata/s3feeder.py | 110 +++++++++++++---------- oonidata/oonidata/tests/test_s3feeder.py | 35 +++++--- oonidata/setup.py | 1 + 4 files changed, 99 insertions(+), 79 deletions(-) diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index e51cbd0a..e8e1f1bf 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -15,11 +15,12 @@ import ujson +from tqdm import tqdm +from tqdm.contrib.logging import logging_redirect_tqdm + from .s3feeder import create_s3_client, FileEntry, download_measurement_container from .s3feeder import jsonl_in_range -Config = namedtuple("Config", ["ccs", "testnames", "keep_s3_cache", "s3cachedir"]) - log = logging.getLogger("oonidata") logging.basicConfig(level=logging.INFO) @@ -50,8 +51,8 @@ def _(json_list: list, max_string_size: int): return json_list -def trim_container(conf, fe: FileEntry, max_string_size: int): - mc = fe.output_path(conf.s3cachedir) +def trim_container(s3cachedir: pathlib.Path, fe: FileEntry, max_string_size: int): + mc = fe.output_path(s3cachedir) temp_path = diskf.with_suffix(".tmp") try: with gzip.open( @@ -73,20 +74,15 @@ def sync(args): # Replace _ with a - testnames = 
list(map(lambda x: x.replace("_", ""), args.test_names)) - conf = Config( - ccs=args.country_codes, - testnames=testnames, - keep_s3_cache=True, - s3cachedir=args.output_dir, - ) - t0 = time.time() - s3 = create_s3_client() - for file_entry in jsonl_in_range(s3, conf.ccs, conf.testnames, args.since, args.until): - if not file_entry.matches_filter(conf.ccs, conf.testnames): - continue - mc = download_measurement_container(s3, conf, file_entry) - if args.max_string_size: - trim_container(conf, fe, args.max_string_size) + log.info(f"Listing measurement in s3 for {args.since} - {args.until} probe_cc: {args.country_codes}") + log.info("This may take a while...") + + file_entries = list(jsonl_in_range(args.country_codes, testnames, args.since, args.until)) + with logging_redirect_tqdm(): + for file_entry in tqdm(file_entries): + mc = download_measurement_container(args.output_dir, file_entry) + if args.max_string_size: + trim_container(args.output_dir, fe, args.max_string_size) def _parse_date_flag(date_str: str) -> dt.date: diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 59a4f03b..d799142b 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -19,6 +19,7 @@ import time import gzip import tarfile +from multiprocessing import Pool import lz4.frame as lz4frame # debdeps: python3-lz4 import ujson @@ -36,10 +37,17 @@ CAN_BUCKET_NAME = "ooni-data" MC_BUCKET_NAME = "ooni-data-eu-fra" +MAX_PROCESS_COUNT = 24 log = logging.getLogger("fastpath") metrics = setup_metrics(name="fastpath.s3feeder") + +def create_s3_client(): + return boto3.client("s3", config=botoConfig(signature_version=botoSigUNSIGNED)) + +s3 = create_s3_client() + # suppress debug logs for x in ("urllib3", "botocore", "s3transfer"): logging.getLogger(x).setLevel(logging.INFO) @@ -150,17 +158,14 @@ def load_multiple(fn: str) -> Generator[MsmtTup, None, None]: raise RuntimeError(f"Unexpected [mini]can filename '{fn}'") -def create_s3_client(): - return 
boto3.client("s3", config=botoConfig(signature_version=botoSigUNSIGNED)) - -def list_cans_on_s3_for_a_day(s3, day: date) -> list: +def list_cans_on_s3_for_a_day(day: date) -> list: return list( - map(lambda fe: (fe.s3path, fe.size), iter_cans_on_s3_for_a_day(s3, day)) + map(lambda fe: (fe.s3path, fe.size), iter_cans_on_s3_for_a_day(day)) ) -def iter_cans_on_s3_for_a_day(s3, day: date): +def iter_cans_on_s3_for_a_day(day: date): """List legacy cans.""" prefix = f"canned/{day}/" paginator = s3.get_paginator("list_objects_v2") @@ -239,7 +244,7 @@ def log_download(self) -> None: log.info(f"Downloading can {self.s3path} size {s:.1f} {d}B") -def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: +def iter_file_entries(prefix: str) -> Generator[FileEntry, None, None]: paginator = s3.get_paginator("list_objects_v2") for r in paginator.paginate(Bucket=MC_BUCKET_NAME, Prefix=prefix): for f in r.get("Contents", []): @@ -262,7 +267,7 @@ def iter_file_entries(s3, prefix: str) -> Generator[FileEntry, None, None]: yield file_entry -def list_all_testnames(s3) -> Set[str]: +def list_all_testnames() -> Set[str]: testnames = set() paginator = s3.get_paginator("list_objects_v2") for r in paginator.paginate(Bucket=MC_BUCKET_NAME, Prefix="jsonl/", Delimiter="/"): @@ -271,7 +276,7 @@ def list_all_testnames(s3) -> Set[str]: return testnames -def get_search_prefixes(s3, testnames: Set[str], ccs: Set[str]) -> List[str]: +def get_search_prefixes(testnames: Set[str], ccs: Set[str]) -> List[str]: """ get_search_prefixes will return all the prefixes inside of the new jsonl bucket that match the given testnames and ccs. 
@@ -294,15 +299,15 @@ def get_search_prefixes(s3, testnames: Set[str], ccs: Set[str]) -> List[str]: def get_jsonl_prefixes( - s3, ccs: Set[str], testnames: Set[str], start_day: date, end_day: date + ccs: Set[str], testnames: Set[str], start_day: date, end_day: date ) -> List[str]: legacy_prefixes = [ f"raw/{d:%Y%m%d}" for d in date_interval(max(date(2020, 10, 20), start_day), end_day) ] if not testnames: - testnames = list_all_testnames(s3) - prefixes = get_search_prefixes(s3, testnames, ccs) + testnames = list_all_testnames() + prefixes = get_search_prefixes(testnames, ccs) # This results in a faster listing in cases where we need only a small time # windows. For larger windows of time, we are better off just listing @@ -312,48 +317,56 @@ def get_jsonl_prefixes( prefixes = [f"{p}{d:%Y%m%d}" for p, d in c] return prefixes + legacy_prefixes +def list_file_entries(prefix): + return [fe for fe in iter_file_entries(prefix)] def jsonl_in_range( - s3, ccs: Set[str], testnames: Set[str], start_day: date, end_day: date + ccs: Set[str], testnames: Set[str], start_day: date, end_day: date ) -> Generator[FileEntry, None, None]: - for p in get_jsonl_prefixes(s3, ccs, testnames, start_day, end_day): - for file_entry in iter_file_entries(s3, p): - if file_entry.ext != "jsonl.gz": - continue + prefixes = get_jsonl_prefixes(ccs, testnames, start_day, end_day) + with Pool(processes=MAX_PROCESS_COUNT) as pool: + fe = pool.imap_unordered( + list_file_entries, + prefixes + ) + for fe_list in fe: + for file_entry in fe_list: + if file_entry.ext != "jsonl.gz": + continue - if not file_entry.matches_filter(ccs, testnames): - continue + if not file_entry.matches_filter(ccs, testnames): + continue - if file_entry.day < start_day or file_entry.day >= end_day: - continue + if file_entry.day < start_day or file_entry.day >= end_day: + continue - if file_entry.size > 0: - yield file_entry + if file_entry.size > 0: + yield file_entry def list_minicans_on_s3_for_a_day( - s3, day: date, ccs: 
Set[str], testnames: Set[str] + day: date, ccs: Set[str], testnames: Set[str] ) -> list: return list( map( lambda fe: (fe.s3path, fe.size), filter( lambda fe: fe.matches_filter(ccs, testnames), - iter_minicans_on_s3_for_a_day(s3, day), + iter_minicans_on_s3_for_a_day(day), ), ) ) -def iter_minicans_on_s3_for_a_day(s3, day: date) -> Generator[FileEntry, None, None]: +def iter_minicans_on_s3_for_a_day(day: date) -> Generator[FileEntry, None, None]: """List minicans. Filter them by CCs and testnames Testnames are without underscores. """ # s3cmd ls s3://ooni-data-eu-fra/raw/20210202 tstamp = day.strftime("%Y%m%d") prefix = f"raw/{tstamp}/" - for file_entry in iter_file_entries(s3, prefix): + for file_entry in iter_file_entries(prefix): if not file_entry.ext != "tar.gz": continue yield file_entry @@ -395,8 +408,8 @@ def date_interval(start_day: date, end_day: date): @metrics.timer("download_measurement_container") -def download_measurement_container(s3, conf, file_entry: FileEntry): - diskf = file_entry.output_path(conf.s3cachedir) +def download_measurement_container(s3cachedir: Path, file_entry: FileEntry): + diskf = file_entry.output_path(s3cachedir) if diskf.exists() and file_entry.size == diskf.stat().st_size: metrics.incr("cache_hit") diskf.touch(exist_ok=True) @@ -437,35 +450,38 @@ def _cb(bytes_count): def stream_measurements( - s3, conf, file_entries: Generator[FileEntry, None, None] + file_entries: Generator[FileEntry, None, None], + s3cachedir: Path, keep_s3_cache: bool, ) -> Generator[MsmtTup, None, None]: - for fe in file_entries: - if not fe.matches_filter(conf.ccs, conf.testnames): - continue - mc = download_measurement_container(s3, conf, fe) - try: - yield from load_multiple(mc.as_posix()) - except Exception as e: - log.error(str(e), exc_info=True) - if not conf.keep_s3_cache: + + with Pool(processes=MAX_PROCESS_COUNT) as pool: + mc_list = pool.starmap( + download_measurement_container, + zip(itertools.repeat(s3cachedir, len(file_entries)), 
file_entries) + ) + for mc in mc_list: try: - mc.unlink() - except FileNotFoundError: - pass + yield from load_multiple(mc.as_posix()) + except Exception as e: + log.error(str(e), exc_info=True) + if not keep_s3_cache: + try: + mc.unlink() + except FileNotFoundError: + pass def stream_cans(conf, start_day: date, end_day: date) -> Generator[MsmtTup, None, None]: """Stream cans from S3""" log.info("Fetching older cans from S3") t0 = time.time() - s3 = create_s3_client() for day in date_interval(start_day, end_day): log.info("Processing day %s", day) can_file_entries = itertools.chain( - iter_cans_on_s3_for_a_day(s3, day), iter_minicans_on_s3_for_a_day(s3, day) + iter_cans_on_s3_for_a_day(day), iter_minicans_on_s3_for_a_day(day) ) - yield from stream_measurements(s3, conf, can_file_entries) + yield from stream_measurements(can_file_entries, conf.s3cachedir, conf.keep_s3_cache) if end_day: log.info(f"Reached {end_day}, streaming cans from S3 finished") @@ -477,9 +493,9 @@ def stream_jsonl( ) -> Generator[MsmtTup, None, None]: """Stream jsonl from S3""" log.info("Fetching jsonl from S3") - s3 = create_s3_client() yield from stream_measurements( - s3, conf, jsonl_in_range(s3, conf.ccs, conf.testnames, start_day, end_day) + jsonl_in_range(conf.ccs, conf.testnames, start_day, end_day), + conf.s3cachedir, conf.keep_s3_cache ) if end_day: diff --git a/oonidata/oonidata/tests/test_s3feeder.py b/oonidata/oonidata/tests/test_s3feeder.py index 64da3a2d..5392aab3 100644 --- a/oonidata/oonidata/tests/test_s3feeder.py +++ b/oonidata/oonidata/tests/test_s3feeder.py @@ -1,16 +1,16 @@ import pytest import time +from pathlib import Path from datetime import date + from oonidata.s3feeder import iter_file_entries, create_s3_client, get_jsonl_prefixes -from oonidata.s3feeder import iter_cans_on_s3_for_a_day, jsonl_in_range +from oonidata.s3feeder import iter_cans_on_s3_for_a_day, jsonl_in_range, list_file_entries +from oonidata.s3feeder import stream_measurements -@pytest.fixture -def 
s3(): - return create_s3_client() -def test_iter_file_entries_new_jsonl(s3): - fe_list = list(iter_file_entries(s3, "jsonl/webconnectivity/IT/20201020/00/")) +def test_iter_file_entries_new_jsonl(): + fe_list = list(iter_file_entries("jsonl/webconnectivity/IT/20201020/00/")) assert len(fe_list) == 19 for fe in fe_list: assert fe.test_name == "webconnectivity" @@ -20,8 +20,8 @@ def test_iter_file_entries_new_jsonl(s3): assert fe.day == date(2020, 10, 20) assert fe.ext == "jsonl.gz" -def test_iter_file_entries_old_format(s3): - fe_list = list(iter_file_entries(s3, "raw/20211020/00/IT/webconnectivity/")) +def test_iter_file_entries_old_format(): + fe_list = list(iter_file_entries("raw/20211020/00/IT/webconnectivity/")) assert len(fe_list) == 6 for fe in fe_list: assert fe.test_name == "webconnectivity" @@ -30,15 +30,22 @@ def test_iter_file_entries_old_format(s3): assert fe.bucket_name == "ooni-data-eu-fra" assert fe.day == date(2021, 10, 20) -def test_iter_cans_on_s3_for_a_day(s3): - fe_list = list(iter_cans_on_s3_for_a_day(s3, date(2020, 1, 1))) +def test_iter_cans_on_s3_for_a_day(): + fe_list = list(iter_cans_on_s3_for_a_day(date(2020, 1, 1))) assert len(fe_list) == 136 assert all(map(lambda fe: fe.bucket_name == "ooni-data", fe_list)) -def test_get_jsonl_prefixes(s3): - prefixes = list(get_jsonl_prefixes(s3, [], [], date(2020, 1, 1), date(2020, 1, 2))) +def test_get_jsonl_prefixes(): + prefixes = list(get_jsonl_prefixes([], [], date(2020, 1, 1), date(2020, 1, 2))) assert len(prefixes) == 2516 -def test_jsonl_in_range(s3): - fe_list = list(jsonl_in_range(s3, [], [], date(2020, 1, 1), date(2020, 1, 2))) +def test_jsonl_in_range(): + fe_list = list(jsonl_in_range([], [], date(2020, 1, 1), date(2020, 1, 2))) assert len(fe_list) == 1125 + +def test_stream_measurements(tmp_path): + fe_list = list_file_entries("jsonl/telegram/IT/20201009/00/") + assert len(fe_list) == 1 + for _, msmt, msmt_uid in stream_measurements(fe_list, tmp_path, False): + assert msmt["probe_cc"] == 
"IT" + assert msmt["test_name"] == "telegram" diff --git a/oonidata/setup.py b/oonidata/setup.py index e875ac42..4571a8f1 100644 --- a/oonidata/setup.py +++ b/oonidata/setup.py @@ -14,6 +14,7 @@ "boto3", "pyyaml", "ujson", + "tqdm", "lz4" ], zip_safe=False, From a371ce4c7a53ad961584c050eef8ee1d9c645737 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Thu, 17 Feb 2022 20:27:57 +0100 Subject: [PATCH 35/49] Fix bug in minican listing --- oonidata/oonidata/s3feeder.py | 2 +- oonidata/oonidata/tests/test_s3feeder.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index d799142b..33b1afe6 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -367,7 +367,7 @@ def iter_minicans_on_s3_for_a_day(day: date) -> Generator[FileEntry, None, None] tstamp = day.strftime("%Y%m%d") prefix = f"raw/{tstamp}/" for file_entry in iter_file_entries(prefix): - if not file_entry.ext != "tar.gz": + if file_entry.ext != "tar.gz": continue yield file_entry diff --git a/oonidata/oonidata/tests/test_s3feeder.py b/oonidata/oonidata/tests/test_s3feeder.py index 5392aab3..f60f9f43 100644 --- a/oonidata/oonidata/tests/test_s3feeder.py +++ b/oonidata/oonidata/tests/test_s3feeder.py @@ -43,7 +43,7 @@ def test_jsonl_in_range(): fe_list = list(jsonl_in_range([], [], date(2020, 1, 1), date(2020, 1, 2))) assert len(fe_list) == 1125 -def test_stream_measurements(tmp_path): +def test_stream_jsonl_measurements(tmp_path): fe_list = list_file_entries("jsonl/telegram/IT/20201009/00/") assert len(fe_list) == 1 for _, msmt, msmt_uid in stream_measurements(fe_list, tmp_path, False): From 6fe7656ad0a2b75b7d3e52af37e826c5a1b39c8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Fri, 18 Feb 2022 18:37:49 +0100 Subject: [PATCH 36/49] Fix fastpath tests --- af/fastpath/fastpath/tests/test_functional.py | 5 ++--- af/fastpath/fastpath/tests/test_unit.py | 2 +- 2 files 
changed, 3 insertions(+), 4 deletions(-) diff --git a/af/fastpath/fastpath/tests/test_functional.py b/af/fastpath/fastpath/tests/test_functional.py index e6a2cea6..dd545391 100644 --- a/af/fastpath/fastpath/tests/test_functional.py +++ b/af/fastpath/fastpath/tests/test_functional.py @@ -143,7 +143,7 @@ def minicans(test_name, start_date: date, end_date: date, end=None): while day <= end_date: tn_filter = set([test_name.replace("_", "")]) log.info(day) - li = s3feeder.list_minicans_on_s3_for_a_day(s3, day, None, tn_filter) + li = s3feeder.list_minicans_on_s3_for_a_day(day, None, tn_filter) for s3fname, s3size in li: # s3fname: raw/20210426/23/YE/ndt/2021042623_YE_ndt.n0.0.tar.gz local_file = Path("testdata") / "mini" / s3fname @@ -167,8 +167,7 @@ def minicans(test_name, start_date: date, end_date: date, end=None): def list_cans_on_s3_for_a_day(day, filter=None, bysize=False): - s3 = s3feeder.create_s3_client() - fns = s3feeder.list_cans_on_s3_for_a_day(s3, day) + fns = s3feeder.list_cans_on_s3_for_a_day(day) if bysize: fns = sorted(fns, key=lambda i: i[1]) else: diff --git a/af/fastpath/fastpath/tests/test_unit.py b/af/fastpath/fastpath/tests/test_unit.py index ef9a1efc..baae2ae7 100644 --- a/af/fastpath/fastpath/tests/test_unit.py +++ b/af/fastpath/fastpath/tests/test_unit.py @@ -146,7 +146,7 @@ def test_score_tor(): "blocking_country": 0.0, "blocking_isp": 0.0, "blocking_local": 0.0, - "extra": {"test_runtime": 0.7671142980000001}, + "extra": {"test_runtime": 0.767114298}, } From 76ad460f15e3b47418e4c25cf166547179ef47c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Fri, 18 Feb 2022 18:49:41 +0100 Subject: [PATCH 37/49] Fix bug in unit test --- af/fastpath/fastpath/tests/test_unit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/af/fastpath/fastpath/tests/test_unit.py b/af/fastpath/fastpath/tests/test_unit.py index baae2ae7..a8b982b5 100644 --- a/af/fastpath/fastpath/tests/test_unit.py +++ 
b/af/fastpath/fastpath/tests/test_unit.py @@ -369,7 +369,7 @@ def test_get_http_header(): "location": "http://example2.com" }, "headers_list": [ - ["location", "http://example.com"] + ["location", "http://example.com"], ["location", "http://example2.com"] ], } From 95e72981d95a779b121619ed80c9bfd82239a590 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Fri, 18 Feb 2022 19:01:35 +0100 Subject: [PATCH 38/49] Adjust the listing heuristic --- oonidata/oonidata/s3feeder.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 33b1afe6..bc6c3d7d 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -309,12 +309,12 @@ def get_jsonl_prefixes( testnames = list_all_testnames() prefixes = get_search_prefixes(testnames, ccs) + combos = list(itertools.product(prefixes, date_interval(start_day, end_day))) # This results in a faster listing in cases where we need only a small time - # windows. For larger windows of time, we are better off just listing - # everything. - if (end_day - start_day).days < 20: - c = itertools.product(prefixes, date_interval(start_day, end_day)) - prefixes = [f"{p}{d:%Y%m%d}" for p, d in c] + # window or few testnames. For larger windows of time, we are better off + # just listing everything. 
+ if len(combos) > 1_000_000: # XXX we might want to tweak this parameter a bit + prefixes = [f"{p}{d:%Y%m%d}" for p, d in combos] return prefixes + legacy_prefixes def list_file_entries(prefix): From 98b1c76cfe57004efdf7e9d432c8ad9ebc3a5b8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Fri, 18 Feb 2022 19:09:48 +0100 Subject: [PATCH 39/49] Fix bug spotted via unit tests --- af/fastpath/fastpath/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/af/fastpath/fastpath/core.py b/af/fastpath/fastpath/core.py index e17ace1f..8955a534 100644 --- a/af/fastpath/fastpath/core.py +++ b/af/fastpath/fastpath/core.py @@ -727,7 +727,7 @@ def get_http_header(resp, header_name, case_sensitive=False): # backward compatibility with older measurements that don't have # header_list - if "header_list" not in resp: + if "headers_list" not in resp: headers = resp.get("headers", {}) header_list = [[h,v] for h,v in headers.items()] else: From 917d65d19f340f7c536b7cd2646edba0dbee54da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Mon, 21 Feb 2022 16:59:51 +0100 Subject: [PATCH 40/49] Don't parallelise stream_measurements * Estimate ETA for stream_measurements --- oonidata/oonidata/s3feeder.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index bc6c3d7d..d621a75c 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -454,21 +454,26 @@ def stream_measurements( s3cachedir: Path, keep_s3_cache: bool, ) -> Generator[MsmtTup, None, None]: - with Pool(processes=MAX_PROCESS_COUNT) as pool: - mc_list = pool.starmap( - download_measurement_container, - zip(itertools.repeat(s3cachedir, len(file_entries)), file_entries) - ) - for mc in mc_list: + t0 = time.time() + total_size = sum(lambda fe: fe.size, file_entries) + processed_size = 0 + + for fe in file_entries: + mc = 
download_measurement_container(s3cachedir, fe) + try: + yield from load_multiple(mc.as_posix()) + except Exception as e: + log.error(str(e), exc_info=True) + processed_size += fe.size + mbps = processed_size / (time.time() - t0) / 1_000_000 + eta = timedelta(seconds=(total_size - processed_size)/(mbps * 1_000_000)) + log.info(f"Speed: {mbps} MB/s") + log.info(f"ETA: {eta}") + if not keep_s3_cache: try: - yield from load_multiple(mc.as_posix()) - except Exception as e: - log.error(str(e), exc_info=True) - if not keep_s3_cache: - try: - mc.unlink() - except FileNotFoundError: - pass + mc.unlink() + except FileNotFoundError: + pass def stream_cans(conf, start_day: date, end_day: date) -> Generator[MsmtTup, None, None]: From 8a9684467b1f3a8c07d96889e8eec20e044911cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Mon, 21 Feb 2022 19:08:27 +0100 Subject: [PATCH 41/49] Fix typo in stream_jsonl_measurements --- oonidata/oonidata/s3feeder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index d621a75c..46840245 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -455,7 +455,7 @@ def stream_measurements( ) -> Generator[MsmtTup, None, None]: t0 = time.time() - total_size = sum(lambda fe: fe.size, file_entries) + total_size = sum(map(lambda fe: fe.size, file_entries)) processed_size = 0 for fe in file_entries: From 79dd2495ee774a5f36fd4ac9c5e08be7b642b6c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 6 Apr 2022 17:14:47 +0200 Subject: [PATCH 42/49] Only look inside the jsonl tree if we need to --- oonidata/oonidata/s3feeder.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index 46840245..cf8990bc 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -19,7 +19,7 @@ import time import 
gzip import tarfile -from multiprocessing import Pool +from multiprocessing import Pool, ThreadPool import lz4.frame as lz4frame # debdeps: python3-lz4 import ujson @@ -307,14 +307,16 @@ def get_jsonl_prefixes( ] if not testnames: testnames = list_all_testnames() - prefixes = get_search_prefixes(testnames, ccs) - - combos = list(itertools.product(prefixes, date_interval(start_day, end_day))) - # This results in a faster listing in cases where we need only a small time - # window or few testnames. For larger windows of time, we are better off - # just listing everything. - if len(combos) > 1_000_000: # XXX we might want to tweak this parameter a bit - prefixes = [f"{p}{d:%Y%m%d}" for p, d in combos] + prefixes = [] + if start_day < date(2020, 10, 21): + prefixes = get_search_prefixes(testnames, ccs) + combos = list(itertools.product(prefixes, date_interval(start_day, end_day))) + # This results in a faster listing in cases where we need only a small time + # window or few testnames. For larger windows of time, we are better off + # just listing everything. 
+ if len(combos) > 1_000_000: # XXX we might want to tweak this parameter a bit + prefixes = [f"{p}{d:%Y%m%d}" for p, d in combos] + return prefixes + legacy_prefixes def list_file_entries(prefix): From 30a8319c5c6377d4ea07017e94dcbf7c9096b845 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 6 Apr 2022 17:16:40 +0200 Subject: [PATCH 43/49] Remove invalid import --- oonidata/oonidata/s3feeder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index cf8990bc..eb409ca4 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -19,7 +19,7 @@ import time import gzip import tarfile -from multiprocessing import Pool, ThreadPool +from multiprocessing import Pool import lz4.frame as lz4frame # debdeps: python3-lz4 import ujson From 73a5ca273f901fd2ebc9bc1f5c883e6c4887b308 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 6 Apr 2022 17:35:08 +0200 Subject: [PATCH 44/49] Add support for benchmarking threadpool vs processpool --- oonidata/oonidata/main.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index e8e1f1bf..6bde6920 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -15,6 +15,9 @@ import ujson +from multiprocessing import Pool +from multiprocessing.pool import ThreadPool + from tqdm import tqdm from tqdm.contrib.logging import logging_redirect_tqdm @@ -68,7 +71,19 @@ def trim_container(s3cachedir: pathlib.Path, fe: FileEntry, max_string_size: int raise +def download_and_trim(args): + def closure(fe): + mc = download_measurement_container(args.output_dir, fe) + if args.max_string_size: + trim_container(args.output_dir, fe, args.max_string_size) + return closure + def sync(args): + ChosenPool = ThreadPool + if args.use_process: + print("Using process pool") + ChosenPool = Pool + testnames = 
[] if args.test_names: # Replace _ with a - @@ -79,11 +94,9 @@ def sync(args): file_entries = list(jsonl_in_range(args.country_codes, testnames, args.since, args.until)) with logging_redirect_tqdm(): - for file_entry in tqdm(file_entries): - mc = download_measurement_container(args.output_dir, file_entry) - if args.max_string_size: - trim_container(args.output_dir, fe, args.max_string_size) - + with ChosenPool() as pool: + func = download_and_trim(args) + list(tqdm(pool.imap_unordered(func, file_entries), total=len(file_entries))) def _parse_date_flag(date_str: str) -> dt.date: return dt.datetime.strptime(date_str, "%Y-%m-%d").date() @@ -96,6 +109,10 @@ def main(): subparsers = parser.add_subparsers() parser_sync = subparsers.add_parser("sync", help="Sync OONI measurements") + parser_sync.add_argument( + "--use-process", + action="store_true" + ) parser_sync.add_argument( "--country-codes", type=str, From 08b321ff985718a1e5ed287ec8f310660876e0d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 6 Apr 2022 17:38:21 +0200 Subject: [PATCH 45/49] Put the closure outside of the function --- oonidata/oonidata/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index 6bde6920..b6372cf1 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -94,8 +94,8 @@ def sync(args): file_entries = list(jsonl_in_range(args.country_codes, testnames, args.since, args.until)) with logging_redirect_tqdm(): + func = download_and_trim(args) with ChosenPool() as pool: - func = download_and_trim(args) list(tqdm(pool.imap_unordered(func, file_entries), total=len(file_entries))) def _parse_date_flag(date_str: str) -> dt.date: From eb976466c0b3de6ccd1b1f2f83c4e6f2dab13399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 6 Apr 2022 17:45:59 +0200 Subject: [PATCH 46/49] Use a partial instead of closure to get process pool to work --- 
oonidata/oonidata/main.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index b6372cf1..f59aca32 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -1,7 +1,7 @@ import argparse import shutil from collections import namedtuple -from functools import singledispatch +from functools import singledispatch, partial import tempfile import os import gzip @@ -71,12 +71,10 @@ def trim_container(s3cachedir: pathlib.Path, fe: FileEntry, max_string_size: int raise -def download_and_trim(args): - def closure(fe): - mc = download_measurement_container(args.output_dir, fe) - if args.max_string_size: - trim_container(args.output_dir, fe, args.max_string_size) - return closure +def download_and_trim(fe, output_dir, max_string_size): + mc = download_measurement_container(output_dir, fe) + if max_string_size: + trim_container(output_dir, fe, max_string_size) def sync(args): ChosenPool = ThreadPool @@ -94,7 +92,7 @@ def sync(args): file_entries = list(jsonl_in_range(args.country_codes, testnames, args.since, args.until)) with logging_redirect_tqdm(): - func = download_and_trim(args) + func = partial(download_and_trim, output_dir=args.output_dir, max_string_size=args.max_string_size) with ChosenPool() as pool: list(tqdm(pool.imap_unordered(func, file_entries), total=len(file_entries))) From 590cf4ef78a1d7ef6acf3181eb87af51d6f3b7cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Wed, 6 Apr 2022 18:08:53 +0200 Subject: [PATCH 47/49] Update oonidata/oonidata/s3feeder.py Co-authored-by: Vinicius Fortuna --- oonidata/oonidata/s3feeder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oonidata/oonidata/s3feeder.py b/oonidata/oonidata/s3feeder.py index eb409ca4..17c5438f 100644 --- a/oonidata/oonidata/s3feeder.py +++ b/oonidata/oonidata/s3feeder.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ -Feeds reports from cans on public S3 bucke or 
local disk +Feeds reports from cans on public S3 bucket or local disk Explore bucket from CLI: AWS_PROFILE=ooni-data aws s3 ls s3://ooni-data/canned/2019-07-16/ From 150fa293fcfcc3358e2faf63ef8ec890c48684d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Tue, 12 Apr 2022 11:28:42 +0200 Subject: [PATCH 48/49] Add metadata for publication of pypi --- oonidata/LICENSE | 11 +++++++++++ oonidata/README.md | 21 +++++++++++++++++++++ oonidata/oonidata/main.py | 17 +++++------------ oonidata/setup.py | 18 ++++++++++++++++++ 4 files changed, 55 insertions(+), 12 deletions(-) create mode 100644 oonidata/LICENSE create mode 100644 oonidata/README.md diff --git a/oonidata/LICENSE b/oonidata/LICENSE new file mode 100644 index 00000000..2b969e99 --- /dev/null +++ b/oonidata/LICENSE @@ -0,0 +1,11 @@ +Copyright 2022 Open Observatory of Network Interference (OONI) + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/oonidata/README.md b/oonidata/README.md new file mode 100644 index 00000000..b39adf01 --- /dev/null +++ b/oonidata/README.md @@ -0,0 +1,21 @@ +# OONI Data + +**Attention** +This tool is currently in alpha stage. The CLI API is subject to change and you +should be careful about relying on it for production usage. + +## What is this? + +OONI data is a tool for interacting with raw OONI measurements. It supports +downloading raw network measurement data in batch. + +For the specifications of the base data formats see: https://github.com/ooni/spec/tree/master/data-formats + +For the specifications of each of the tests see: https://github.com/ooni/spec/tree/master/nettests + +## Example usage + +To download raw Web Connectivity measurements for a given country and time range, use the following: +``` +oonidata sync --since 2022-02-23 --until 2022-03-17 --country-codes IT --test-names web_connectivity --output-dir ./oonidatastore/ +``` diff --git a/oonidata/oonidata/main.py b/oonidata/oonidata/main.py index f59aca32..dcc22c66 100644 --- a/oonidata/oonidata/main.py +++ b/oonidata/oonidata/main.py @@ -15,7 +15,6 @@ import ujson -from multiprocessing import Pool from multiprocessing.pool import ThreadPool from tqdm import tqdm @@ -76,12 +75,8 @@ def download_and_trim(fe, output_dir, max_string_size): if max_string_size: trim_container(output_dir, fe, max_string_size) -def sync(args): - ChosenPool = ThreadPool - if args.use_process: - print("Using process pool") -
ChosenPool = Pool +def sync(args): testnames = [] if args.test_names: # Replace _ with a - @@ -92,10 +87,12 @@ def sync(args): file_entries = list(jsonl_in_range(args.country_codes, testnames, args.since, args.until)) with logging_redirect_tqdm(): - func = partial(download_and_trim, output_dir=args.output_dir, max_string_size=args.max_string_size) - with ChosenPool() as pool: + func = partial(download_and_trim, output_dir=args.output_dir, + max_string_size=args.max_string_size) + with ThreadPool() as pool: list(tqdm(pool.imap_unordered(func, file_entries), total=len(file_entries))) + def _parse_date_flag(date_str: str) -> dt.date: return dt.datetime.strptime(date_str, "%Y-%m-%d").date() @@ -107,10 +104,6 @@ def main(): subparsers = parser.add_subparsers() parser_sync = subparsers.add_parser("sync", help="Sync OONI measurements") - parser_sync.add_argument( - "--use-process", - action="store_true" - ) parser_sync.add_argument( "--country-codes", type=str, diff --git a/oonidata/setup.py b/oonidata/setup.py index 4571a8f1..7c03bdff 100644 --- a/oonidata/setup.py +++ b/oonidata/setup.py @@ -3,8 +3,17 @@ from setuptools import setup +with open("README.md", "r", encoding="utf-8") as in_file: + long_description = in_file.read() + setup( name="oonidata", + version="0.0.1", + author="Open Observatory of Network Interference (OONI)", + author_email="contact@openobservatory.org", + description="Interact with OONI network measurement data", + long_description=long_description, + long_description_content_type="text/markdown", python_requires=">=3.7.0", packages=["oonidata"], entry_points={"console_scripts": [ @@ -17,5 +26,14 @@ "tqdm", "lz4" ], + project_urls={ + "Bug Tracker": "https://github.com/ooni/backend/issues" + }, + classifiers=[ + "Development Status :: 2 - Pre-Alpha", + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent" + ], zip_safe=False, ) From 3543ae67760026870e14c16be454a5cc4f19131e Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Arturo=20Filast=C3=B2?= Date: Tue, 12 Apr 2022 11:30:08 +0200 Subject: [PATCH 49/49] Add .gitignore --- oonidata/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 oonidata/.gitignore diff --git a/oonidata/.gitignore b/oonidata/.gitignore new file mode 100644 index 00000000..00211148 --- /dev/null +++ b/oonidata/.gitignore @@ -0,0 +1,2 @@ +dist/ +oonidata.egg-info/