cykl · naxhh · Feb 3, 2020 · Feb 3, 2020
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,32 @@
+FROM ubuntu:bionic
+
+# Runtime tools plus the Python toolchain. The index is refreshed, used and
+# removed inside one layer so no apt lists persist in the image.
+# libavcodec-extra57 carries a version suffix, so it is tied to this base
+# release and must be revisited if the FROM tag changes.
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    ffmpeg \
+    libavcodec-extra57 \
+    python3 \
+    python3-pip \
+    python3-setuptools \
+    rtmpdump \
+    swftools \
+  && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /code
+
+COPY . /code
+
+# --no-cache-dir keeps pip's download cache out of the image layer.
+# wheel is installed first, presumably so the requirements build as wheels.
+RUN pip3 install --no-cache-dir wheel && \
+    pip3 install --no-cache-dir -r requirements.txt && \
+    python3 setup.py install
+
+# NOTE(review): no USER is set, so the tool runs as root. /app is presumably
+# a mount point for output; confirm ownership needs before adding USER.
+WORKDIR /app
+
+ENTRYPOINT ["infoqscraper"]
diff --git a/infoqscraper/scrap.py b/infoqscraper/scrap.py
@@ -87,22 +87,19 @@ def _fetch(self):
 
     @property
     def metadata(self):
-        def get_title(pres_div):
-            return pres_div.find('h1', class_="general").div.get_text().strip()
+        def get_title(document):  # trimmed text of the first <h1> found in the document
+            return document.find('h1').get_text().strip()
 
-        def get_date(pres_div):
-            strings = ''.join(pres_div.find('span', class_='author_general').strings)
-            match = re.search('on[\n ]+(.*\d{4})', strings)
-            if match:
-                return datetime.datetime.strptime(match.group(1), "%b %d, %Y")
-            else:
-                raise Exception("Failed to extract date (markup changed?)")
+        def get_date(document):  # datetime parsed from the text of the first <p class="date">
+            string_date = document.find('p', class_='date').get_text().strip()
 
-        def get_author(pres_div):
-            return pres_div.find('span', class_='authors-list').find('a').get_text().strip()
+            return datetime.datetime.strptime(string_date, "%b %d, %Y")  # expects text such as "Feb 3, 2020"
 
-        def get_timecodes(pres_div):
-            for script in pres_div.find_all('script'):
+        def get_author(document):  # trimmed text of the first <a> carrying class 'author__link'
+            return document.find('a', 'author__link').get_text().strip()
+
+        def get_timecodes(document):
+            for script in document.find_all('script'):
                 mo = re.search("TIMES\s?=\s?new\s+Array.?\((\d+(,\d+)+)\)", script.get_text())
                 if mo:
                     return [int(tc) for tc in mo.group(1).split(',')]
@@ -128,14 +125,14 @@ def get_video(pres_div):
                     else:
                         raise Exception("Unsupported video type: %s" % path)
 
-        def get_bio(div):
-            return div.find('p', id="biotext").get_text(strip=True)
+        def get_bio(document):  # trimmed text of the first <p> inside the first <div class="bio">
+            return document.find('div', class_='bio').find('p').get_text().strip()
 
-        def get_summary(div):
-            return "".join(div.find('p', id="summary").get_text("|", strip=True).split("|")[1:])
+        def get_summary(document):  # trimmed text of the first <p> inside the first <div class="summary">
+            return document.find('div', class_='summary').find('p').get_text().strip()
 
-        def get_about(div):
-            return div.find('p', id="conference").get_text(strip=True)
+        def get_about(document):  # trimmed text of the first <p> inside the first <div class="about-conference">
+            return document.find('div', class_='about-conference').find('p').get_text().strip()
 
         def get_demo_timings(pres_div):
             for script in pres_div.find_all('script'):
@@ -165,22 +162,23 @@ def add_mp3_if_exist(metadata, bc3):
                     metadata['mp3'] = client.get_url(a['href'])
 
         if not hasattr(self, "_metadata"):
-            pres_div = self.soup.find('div', class_='presentation_full')
+            pres_div = self.soup.find('div', class_='container__inner')
             metadata = {
                 'url': client.get_url("/presentations/" + self.id),
-                'title': get_title(pres_div),
-                'date' : get_date(pres_div),
-                'auth' : get_author(pres_div),
+                'title': get_title(self.soup),
+                'date' : get_date(self.soup),
+                'auth' : get_author(self.soup),
                 'timecodes': get_timecodes(self.soup),
                 'demo_timings': get_demo_timings(self.soup),
                 'slides': get_slides(self.soup),
                 'video_url': six.u("rtmpe://video.infoq.com/cfx/st/"),
                 'video_path': get_video(self.soup),
-                'bio':        get_bio(pres_div),
-                'summary':    get_summary(pres_div),
-                'about':      get_about(pres_div),
+                'bio':        get_bio(self.soup),
+                'summary':    get_summary(self.soup),
+                'about':      get_about(self.soup),
 
                 }
+
             add_mp3_if_exist(metadata, pres_div)
             add_pdf_if_exist(metadata, pres_div)