Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Image that installs infoqscraper from the build context and runs it as the
# container entrypoint.
FROM ubuntu:bionic

# System packages: external media tools (rtmpdump, ffmpeg, extra codecs,
# swftools) plus Python 3 with pip and setuptools.
# DEBIAN_FRONTEND=noninteractive keeps apt from prompting during the build,
# --no-install-recommends skips optional packages, and the apt lists are
# removed in the same layer so they are not stored in the image.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        rtmpdump \
        ffmpeg \
        libavcodec-extra57 \
        swftools \
        python3 \
        python3-pip \
        python3-setuptools \
    && rm -rf /var/lib/apt/lists/*

# The whole build context is copied to /code and installed from there.
WORKDIR /code

COPY . /code

# --no-cache-dir keeps pip's download/wheel cache out of the image layers,
# mirroring the apt list cleanup above.
# wheel is installed before the requirements (presumably so they can be built
# and installed as wheels -- confirm before reordering).
RUN pip3 install --no-cache-dir wheel
RUN pip3 install --no-cache-dir -r requirements.txt
RUN python3 setup.py install

# Commands run from /app rather than from the source tree in /code.
WORKDIR /app

# Arguments given to `docker run <image> ...` are passed to infoqscraper.
ENTRYPOINT ["infoqscraper"]
50 changes: 24 additions & 26 deletions infoqscraper/scrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,22 +87,19 @@ def _fetch(self):

@property
def metadata(self):
def get_title(pres_div):
return pres_div.find('h1', class_="general").div.get_text().strip()
def get_title(document):
return document.find('h1').get_text().strip()

def get_date(pres_div):
strings = ''.join(pres_div.find('span', class_='author_general').strings)
match = re.search('on[\n ]+(.*\d{4})', strings)
if match:
return datetime.datetime.strptime(match.group(1), "%b %d, %Y")
else:
raise Exception("Failed to extract date (markup changed?)")
def get_date(document):
string_date = document.find('p', class_='date').get_text().strip()

def get_author(pres_div):
return pres_div.find('span', class_='authors-list').find('a').get_text().strip()
return datetime.datetime.strptime(string_date, "%b %d, %Y")

def get_timecodes(pres_div):
for script in pres_div.find_all('script'):
def get_author(document):
return document.find('a', 'author__link').get_text()

def get_timecodes(document):
for script in document.find_all('script'):
mo = re.search("TIMES\s?=\s?new\s+Array.?\((\d+(,\d+)+)\)", script.get_text())
if mo:
return [int(tc) for tc in mo.group(1).split(',')]
Expand All @@ -128,14 +125,14 @@ def get_video(pres_div):
else:
raise Exception("Unsupported video type: %s" % path)

def get_bio(div):
return div.find('p', id="biotext").get_text(strip=True)
def get_bio(document):
return document.find('div', class_='bio').find('p').get_text().strip()

def get_summary(div):
return "".join(div.find('p', id="summary").get_text("|", strip=True).split("|")[1:])
def get_summary(document):
return document.find('div', class_='summary').find('p').get_text().strip()

def get_about(div):
return div.find('p', id="conference").get_text(strip=True)
def get_about(document):
return document.find('div', class_='about-conference').find('p').get_text().strip()

def get_demo_timings(pres_div):
for script in pres_div.find_all('script'):
Expand Down Expand Up @@ -165,22 +162,23 @@ def add_mp3_if_exist(metadata, bc3):
metadata['mp3'] = client.get_url(a['href'])

if not hasattr(self, "_metadata"):
pres_div = self.soup.find('div', class_='presentation_full')
pres_div = self.soup.find('div', class_='container__inner')
metadata = {
'url': client.get_url("/presentations/" + self.id),
'title': get_title(pres_div),
'date' : get_date(pres_div),
'auth' : get_author(pres_div),
'title': get_title(self.soup),
'date' : get_date(self.soup),
'auth' : get_author(self.soup),
'timecodes': get_timecodes(self.soup),
'demo_timings': get_demo_timings(self.soup),
'slides': get_slides(self.soup),
'video_url': six.u("rtmpe://video.infoq.com/cfx/st/"),
'video_path': get_video(self.soup),
'bio': get_bio(pres_div),
'summary': get_summary(pres_div),
'about': get_about(pres_div),
'bio': get_bio(self.soup),
'summary': get_summary(self.soup),
'about': get_about(self.soup),

}

add_mp3_if_exist(metadata, pres_div)
add_pdf_if_exist(metadata, pres_div)

Expand Down