diff --git a/docker-compose.yml b/docker-compose.yml
index 0f611e5..73efba7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -13,11 +13,28 @@ services:
     image: spokanetech:latest
     build:
       context: .
+    command: [
+      "python",
+      "-m",
+      "celery",
+      "--workdir",
+      "./src",
+      "-A",
+      "spokanetech.celery",
+      "worker",
+      "-B",
+      "-l",
+      "INFO",
+      "--events"
+    ]
     container_name: worker
     ports:
       - "5555:5555"
     env_file:
       - .env
+    environment:
+      SPOKANE_TECH_DEV: "false"  # quoted: Compose requires boolean-like environment values to be strings
+      CELERY_BROKER_URL: "redis://redis:6379/0"
 
   redis:
     image: redis:7.2
diff --git a/src/spokanetech/settings.py b/src/spokanetech/settings.py
index 29ab9b6..be3fc47 100644
--- a/src/spokanetech/settings.py
+++ b/src/spokanetech/settings.py
@@ -248,6 +248,7 @@ CELERY_TASK_ACKS_LATE = True
 CELERY_TIMEZONE = TIME_ZONE
 
 CELERY_BEAT_SCHEDULER = "django_celery_beat.schedulers:DatabaseScheduler"
+CELERY_LATE_ACK = True  # NOTE(review): not a documented Celery setting; CELERY_TASK_ACKS_LATE is already set above - confirm and drop
 
 
 # Discord
diff --git a/src/web/migrations/0015_remove_eventbriteorganization_eventbrite_id.py b/src/web/migrations/0015_remove_eventbriteorganization_eventbrite_id.py
new file mode 100644
index 0000000..bc5777e
--- /dev/null
+++ b/src/web/migrations/0015_remove_eventbriteorganization_eventbrite_id.py
@@ -0,0 +1,17 @@
+# Generated by Django 5.0.8 on 2024-09-11 05:09
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('web', '0014_event_image_techgroup_image'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='eventbriteorganization',
+            name='eventbrite_id',
+        ),
+    ]
diff --git a/src/web/models.py b/src/web/models.py
index 2cf4911..066e017 100644
--- a/src/web/models.py
+++ b/src/web/models.py
@@ -115,4 +115,3 @@ def get_absolute_url(self) -> str:
 class EventbriteOrganization(models.Model):
     tech_group = models.ForeignKey(TechGroup, on_delete=models.CASCADE)
     url = models.URLField()
-    eventbrite_id = models.CharField(max_length=256)
diff --git a/src/web/scrapers.py b/src/web/scrapers.py
index 8476185..8054faf 100644
--- a/src/web/scrapers.py
+++ b/src/web/scrapers.py
@@ -3,13 +3,14 @@
 import pathlib
 import re
 import urllib.parse
+import zoneinfo
 from datetime import datetime, timedelta
-from typing import Any, Protocol, TypeAlias, TypeVar
+from typing import Any, Callable, Protocol, TypeAlias, TypeVar
 
 import eventbrite.access_methods
 import requests
-import zoneinfo
-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup
+from bs4.element import Tag
 from django.conf import settings
 from django.utils import timezone
 from eventbrite import Eventbrite
@@ -26,11 +27,16 @@ def get_venue(self, id, **data):
 
 
 def get_event_description(self, id, **data):
-    return self.get("/events/{0}/description//".format(id), data=data)
+    return self.get("/events/{0}/description/".format(id), data=data)
+
+
+def get_events_for_series(self, id, **data):
+    return self.get("/series/{0}/events/".format(id), data=data)
 
 
 setattr(eventbrite.access_methods.AccessMethodsMixin, "get_venue", get_venue)
 setattr(eventbrite.access_methods.AccessMethodsMixin, "get_event_description", get_event_description)
+setattr(eventbrite.access_methods.AccessMethodsMixin, "get_events_for_series", get_events_for_series)
 
 
 class Scraper(Protocol[ST]):
@@ -174,7 +180,7 @@ def _parse_date_time(self, soup: BeautifulSoup) -> datetime:
         return datetime.fromisoformat(soup.find_all("time")[0]["datetime"])
 
     def _parse_duration(self, soup: BeautifulSoup) -> timedelta:
-        time: Tag = soup.find_all("time")[0]
+        time = soup.find_all("time")[0]
         matches = self.DURATION_PATTERN.findall(time.text)
         if not matches:
             raise ValueError("Could not find duration from:", time.text)
@@ -201,18 +207,59 @@ def _parse_tags(self, soup: BeautifulSoup) -> list[models.Tag]:
 
 
 class EventbriteScraper(Scraper[list[EventScraperResult]]):
+    # The numeric ID is the last hyphen-separated token of the URL slug. The slug itself
+    # may contain digits (e.g. "inch360-72020528223"), so capture only the trailing token.
+    ORGANIZATION_ID_PATTERN = re.compile(r"/o/(?:[\w-]*-)?(\d+)(?:[/?#]|$)")
+    EVENT_SERIES_ID_PATTERN = re.compile(r"/e/(?:[\w-]*-)?(\d+)(?:[/?#]|$)")
+
     def __init__(self, api_token: str | None = None):
         self.client = Eventbrite(api_token or settings.EVENTBRITE_API_TOKEN)
         self._location_by_venue_id: dict[str, str] = {}
 
-    def scrape(self, organization_id: str) -> list[EventScraperResult]:
-        response = self.client.get_organizer_events(
-            organization_id,
-            status="live",
-        )
-        events_and_tags = [self.map_to_event(eventbrite_event) for eventbrite_event in response["events"]]
+    def scrape(self, url: str) -> list[EventScraperResult]:
+        """Scrape every event for the organizer or event series identified by an Eventbrite URL."""
+        request_func, object_id = self.get_request_func(url)
+        request_func = functools.partial(request_func, object_id)
+        events = self.paginate_all(request_func, "events")
+        events_and_tags = [self.map_to_event(eventbrite_event) for eventbrite_event in events]
         return events_and_tags
 
+    def get_request_func(self, url: str) -> tuple[Callable[..., Any], str]:
+        """Parse the API request function and ID from the URL."""
+        if matches := self.ORGANIZATION_ID_PATTERN.findall(url):
+            organization_id = matches[0]
+            return (functools.partial(self.client.get_organizer_events, status="live"), organization_id)
+        elif matches := self.EVENT_SERIES_ID_PATTERN.findall(url):
+            event_series_id = matches[0]
+            return (self.client.get_events_for_series, event_series_id)  # type: ignore
+        else:
+            raise ValueError(f"invalid Eventbrite url: {url}")
+
+    def paginate_all(self, request_func: Callable[..., Any], key: str) -> list:
+        """Iterate through all the pages of the request."""
+        response = request_func()
+        self.check_response(response)
+        result = list(response[key])
+        if getattr(response, "is_paginated", False):
+            while response["pagination"]["has_more_items"]:
+                continuation = response["pagination"]["continuation"]
+                response = request_func(continuation=continuation)
+                self.check_response(response)
+                result.extend(response[key])
+        return result
+
+    def check_response(self, response: Any) -> None:
+        """Raise ValueError if the response carries an HTTP error status (>= 400)."""
+        status_code: int = getattr(response, "status_code", 0)
+        if not status_code:
+            status_code = response.get("status_code", 0)
+
+        if status_code >= 400:
+            raise ValueError(
+                f"Eventbrite scrape error: [{status_code}] "
+                f"{response.get('error')}: {response.get('error_description')}"
+            )
+
     def map_to_event(self, eventbrite_event: dict) -> tuple[models.Event, list[models.Tag]]:
         name = eventbrite_event["name"]["text"]
         start = datetime.fromisoformat(eventbrite_event["start"]["utc"])
diff --git a/src/web/services.py b/src/web/services.py
index f0e295e..0371083 100644
--- a/src/web/services.py
+++ b/src/web/services.py
@@ -57,7 +57,7 @@ def save_events(self) -> None:
         now = timezone.localtime()
         for eventbrite_organization in models.EventbriteOrganization.objects.prefetch_related("tech_group"):
             tech_group = eventbrite_organization.tech_group
-            events_and_tags = self.events_scraper.scrape(eventbrite_organization.eventbrite_id)
+            events_and_tags = self.events_scraper.scrape(eventbrite_organization.url)
             for event, _ in events_and_tags:
                 event.group = tech_group
                 event.approved_at = now
diff --git a/src/web/tasks.py b/src/web/tasks.py
index b443b02..92578e3 100644
--- a/src/web/tasks.py
+++ b/src/web/tasks.py
@@ -18,8 +18,8 @@ def scrape_events_from_meetup():
 def scrape_events_from_eventbrite():
     """Scrape upcoming events from Eventbrite."""
     events_scraper = scrapers.EventbriteScraper()
-    meetup_service = services.EventbriteService(events_scraper)
-    meetup_service.save_events()
+    eventbrite_service = services.EventbriteService(events_scraper)
+    eventbrite_service.save_events()
 
 
 @shared_task()
diff --git a/src/web/tests/test_scrapers.py b/src/web/tests/test_scrapers.py
index 54dfcf9..5806ccf 100644
--- a/src/web/tests/test_scrapers.py
+++ b/src/web/tests/test_scrapers.py
@@ -1,12 +1,13 @@
 import pathlib
 from datetime import datetime, timedelta
+from zoneinfo import ZoneInfo
 
 import freezegun
-import responses
 import pytest
+import responses
 from django.test import TestCase
+
 from web import models, scrapers
-from zoneinfo import ZoneInfo
 
 
 class TestMeetupHomepageScraper(TestCase):
@@ -126,9 +127,9 @@ class TestEventbriteScraper(TestCase):
     To run them, set the `EVENTBRITE_API_TOKEN` envrionment variable.
     """
 
-    def test_scraper(self):
+    def test_scraper_organization_id(self):
         scraper = scrapers.EventbriteScraper()
-        result = scraper.scrape("72020528223")
+        result = scraper.scrape("https://www.eventbrite.com/o/inch360-72020528223")
         actual: models.Event = result[0][0]
         assert actual.name == "Spring Cyber - Training Series"
         assert actual.description and actual.description.startswith(
@@ -139,3 +140,8 @@ def test_scraper(self):
         assert actual.location == "2818 North Sullivan Road #Suite 100, Spokane Valley, WA 99216"
         assert actual.url == "https://www.eventbrite.com/e/spring-cyber-training-series-tickets-860181354587"
         assert actual.external_id == "860181354587"
+
+    def test_scraper_event_series_id(self):
+        scraper = scrapers.EventbriteScraper()
+        result = scraper.scrape("https://www.eventbrite.com/e/cda-machine-learners-ai-ml-club-tickets-640757311367")
+        assert len(result) > 1