From 6ec3366be6d8c574ca54088862867e8b456c500f Mon Sep 17 00:00:00 2001 From: Anand Chitipothu Date: Tue, 15 May 2012 20:34:03 +0530 Subject: [PATCH 01/28] Test to make sure arc file header is written just once. And it is failing now. --- warc/tests/test_arc.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/warc/tests/test_arc.py b/warc/tests/test_arc.py index 11305e5..ae67124 100644 --- a/warc/tests/test_arc.py +++ b/warc/tests/test_arc.py @@ -153,6 +153,16 @@ def test_arc1_v1_writer_default_headers(): assert opfile.getvalue() == expected_value f.close() +def test_arc_v1_writer_write_headers(): + """Test to make sure header is written just once. + """ + f = StringIO.StringIO() + f.name = "sample.arc" + afile = arc.ARCFile(fileobj=f, version=1) + afile._write_header() + + # Make sure header is written only once + assert f.getvalue().count("filedesc://") == 1 def test_arc_v2_writer(): "Try writing records to an ARC V2 file. This is what API will feel like to a user of the library" From 7fd90e3375584b97dcb26f5ecec6312f0a080f8e Mon Sep 17 00:00:00 2001 From: Anand Chitipothu Date: Tue, 15 May 2012 20:35:06 +0530 Subject: [PATCH 02/28] Fixed the issue of writing duplicate file header in arc file. --- warc/arc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/warc/arc.py b/warc/arc.py index 5889587..1ffb17a 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -292,7 +292,8 @@ def _write_header(self): offset = str(self.fileobj.tell()), filename = fname) arc_file_header_record = ARCRecord(header, payload%self.file_headers) - self.write(arc_file_header_record) + arc_file_header_record.write_to(self.fileobj, self.version) + self.fileobj.write("\n") # record separator def write(self, arc_record): "Writes out the given arc record to the file" From 249953d2c1073c92f5a0ba30bbdb54a68b22d64a Mon Sep 17 00:00:00 2001 From: Anand Chitipothu Date: Tue, 15 May 2012 21:04:48 +0530 Subject: [PATCH 03/28] ARCFile: filename is specified should be used and should work even if filename is not specified. --- warc/arc.py | 9 ++++++++- warc/tests/test_arc.py | 39 ++++++++++++++++++++++++++++----------- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/warc/arc.py b/warc/arc.py index 1ffb17a..3e74a57 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -253,6 +253,13 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ fileobj = __builtin__.open(filename, mode or "rb") self.fileobj = fileobj + self.filename = filename + if self.filename is None: + if hasattr(self.fileobj, "name"): + self.filename = self.fileobj.name + else: + self.filename = "" + if version != None and int(version) not in (1, 2): raise TypeError("ARC version has to be 1 or 2") self.version = version @@ -280,7 +287,7 @@ def _write_header(self): else: raise IOError("Can't write an ARC file with version '\"%s\"'"%self.version) - fname = os.path.basename(self.fileobj.name) + fname = os.path.basename(self.filename) header = ARCHeader(url = "filedesc://%s"%fname, ip_address = self.file_headers['ip_address'], date = self.file_headers['date'], diff --git a/warc/tests/test_arc.py b/warc/tests/test_arc.py index ae67124..ed2521c 100644 --- a/warc/tests/test_arc.py +++ b/warc/tests/test_arc.py @@ -153,17 +153,6 @@ def test_arc1_v1_writer_default_headers(): assert opfile.getvalue() == expected_value f.close() -def test_arc_v1_writer_write_headers(): - """Test to make sure header is written just once. - """ - f = StringIO.StringIO() - f.name = "sample.arc" - afile = arc.ARCFile(fileobj=f, version=1) - afile._write_header() - - # Make sure header is written only once - assert f.getvalue().count("filedesc://") == 1 - def test_arc_v2_writer(): "Try writing records to an ARC V2 file. This is what API will feel like to a user of the library" now = "20120302193210" @@ -308,3 +297,31 @@ def test_arc_record_versions(): record_string = f.getvalue() assert record_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\nBlahBlah\n" + +class TestARCFile: + def test_write_headers(self): + """Test to make sure header is written just once. + """ + f = StringIO.StringIO() + f.name = "sample.arc" + afile = arc.ARCFile(fileobj=f, version=1) + afile._write_header() + + # Make sure header is written only once + assert f.getvalue().count("filedesc://") == 1 + + def test_filename(self): + """If filename is specified as argument to ARCFile, it should be used.""" + f = StringIO.StringIO() + afile = arc.ARCFile(fileobj=f, filename="sample.arc", version=1) + afile._write_header() + assert "sample.arc" in f.getvalue() + + def test_no_filename(self): + """should be able to write ARCFile even if there is no filename.""" + f = StringIO.StringIO() + afile = arc.ARCFile(fileobj=f, version=1) + afile._write_header() + # filename should be empty + assert f.getvalue().startswith("filedesc:// ") + From c5d1dfa530bfcdfd83e8914352d233b661d4aa7a Mon Sep 17 00:00:00 2001 From: Anand Chitipothu Date: Tue, 15 May 2012 21:13:10 +0530 Subject: [PATCH 04/28] bump version to 0.2.1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 23a2f87..3daa4f0 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name="warc", - version="0.2.0", + version="0.2.1", description="Python library to work with ARC and WARC files", long_description=open('Readme.rst').read(), license='GPLv2', From 9297ded00b4ec398e0d7110f7a679e0118e73ea8 Mon Sep 17 00:00:00 2001 From: Ryan Chartier Date: Sat, 15 Nov 2014 13:26:18 -0700 Subject: [PATCH 05/28] Added some bugfixes Manual addition of some bugfixes found in upstream issues and pull requests. --- setup.py | 2 +- warc/arc.py | 28 +++++++++++++++++++++++++++- warc/warc.py | 2 +- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 3daa4f0..96d3f7c 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ 'Development Status :: 4 - Beta', 'Environment :: Web Environment', 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', + 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)', 'Operating System :: OS Independent', 'Programming Language :: Python', ], diff --git a/warc/arc.py b/warc/arc.py index 3e74a57..fc051ea 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -251,6 +251,14 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ """ if fileobj is None: fileobj = __builtin__.open(filename, mode or "rb") + mode = fileobj.mode + # initialize compress based on filename, if not already specified + if compress is None and filename and filename.endswith(".gz"): + compress = True + + if compress: + fileobj = gzip2.GzipFile(fileobj=fileobj, mode=mode) + self.fileobj = fileobj self.filename = filename @@ -266,6 +274,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ self.file_headers = file_headers self.header_written = False self.header_read = False + self.file_meta = '' def _write_header(self): @@ -319,7 +328,6 @@ def _read_file_header(self): payload1 = self.fileobj.readline() payload2 = self.fileobj.readline() version, reserved, organisation = payload1.split(None, 2) - self.fileobj.readline() # Lose the separator newline self.header_read = True # print "--------------------------------------------------" # print header,"\n", payload1, "\n", payload2,"\n" @@ -342,6 +350,15 @@ def _read_file_header(self): else: raise IOError("Unknown ARC version '%s'"%version) + current = len(payload1) + len(payload2) + self.file_meta = '' + while current < int(length): + line = self.fileobj.readline() + current = current + len(line) + self.file_meta = self.file_meta + line + self.fileobj.readline() # Lose the separator newline + + def _read_arc_record(self): "Reads out an arc record, formats it and returns it" #XXX:Noufal Stream payload here rather than just read it @@ -355,6 +372,15 @@ def _read_arc_record(self): while header and header.strip() == "": header = self.fileobj.readline() + #JG: this block stops the header parser / reader + #from getting caught on the XML lump + #that can appear in ARC files + if header.startswith("\n"): + header = self.fileobj.readline() + header = self.fileobj.readline() + header = self.fileobj.readline() + if header == "": return None diff --git a/warc/warc.py b/warc/warc.py index 0c762a6..a41562f 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -237,7 +237,7 @@ def from_response(response): headers = { "WARC-Type": "response", - "WARC-Target-URI": response.request.full_url.encode('utf-8') + "WARC-Target-URI": response.request.url.encode('utf-8') } return WARCRecord(payload=payload, headers=headers) From 09d0e3153e26fd2fd0cb3861e1cdca19e4d3f264 Mon Sep 17 00:00:00 2001 From: Ryan Chartier Date: Sat, 15 Nov 2014 13:35:52 -0700 Subject: [PATCH 06/28] Ran 2to3, fixed CaseInsensitiveDict Almost certainly broken. --- docs/conf.py | 12 ++++++------ warc/arc.py | 12 ++++++------ warc/gzip2.py | 4 ++-- warc/tests/test_arc.py | 36 ++++++++++++++++++------------------ warc/tests/test_utils.py | 2 +- warc/tests/test_warc.py | 2 +- warc/utils.py | 18 +++++++++++------- warc/warc.py | 18 +++++++++--------- 8 files changed, 54 insertions(+), 50 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 4469a18..a59b35c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -40,8 +40,8 @@ master_doc = 'index' # General information about the project. -project = u'warc' -copyright = u'2012, Internet Archive' +project = 'warc' +copyright = '2012, Internet Archive' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -178,8 +178,8 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'warc.tex', u'WARC Documentation', - u'Internet Archive', 'manual'), + ('index', 'warc.tex', 'WARC Documentation', + 'Internet Archive', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -211,6 +211,6 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'warc', u'WARC Documentation', - [u'Internet Archive'], 1) + ('index', 'warc', 'WARC Documentation', + ['Internet Archive'], 1) ] diff --git a/warc/arc.py b/warc/arc.py index fc051ea..35830f9 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -4,11 +4,11 @@ :copyright: (c) 2012 Internet Archive """ -import __builtin__ +import builtins import datetime import os import re -import StringIO +import io import warnings from .utils import CaseInsensitiveDict @@ -135,7 +135,7 @@ def length(self): return int(self["length"]) def __str__(self): - f = StringIO.StringIO() + f = io.StringIO() self.write_to(f) return f.getvalue() @@ -144,7 +144,7 @@ def __repr__(self): for i in "url ip_address date content_typeresult_code checksum location offset filename length".split(): if hasattr(self,i): f[i] = getattr(self, i) - s = ['%s = "%s"'%(k, v) for k,v in f.iteritems()] + s = ['%s = "%s"'%(k, v) for k,v in f.items()] s = ", ".join(s) return ""%s @@ -200,7 +200,7 @@ def __setitem__(self, name, value): def __str__(self): - f = StringIO.StringIO() + f = io.StringIO() self.write_to(f) return f.getvalue() @@ -250,7 +250,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ """ if fileobj is None: - fileobj = __builtin__.open(filename, mode or "rb") + fileobj = builtins.open(filename, mode or "rb") mode = fileobj.mode # initialize compress based on filename, if not already specified if compress is None and filename and filename.endswith(".gz"): diff --git a/warc/gzip2.py b/warc/gzip2.py index fcd6b48..c4af5f4 100644 --- a/warc/gzip2.py +++ b/warc/gzip2.py @@ -49,7 +49,7 @@ def close_member(self): self.fileobj.write(self.compress.flush()) write32u(self.fileobj, self.crc) # self.size may exceed 2GB, or even 4GB - write32u(self.fileobj, self.size & 0xffffffffL) + write32u(self.fileobj, self.size & 0xffffffff) self.size = 0 self.compress = zlib.compressobj(9, zlib.DEFLATED, @@ -113,7 +113,7 @@ def write_member(self, data): The data can be a string, an iterator that gives strings or a file-like object. """ - if isinstance(data, basestring): + if isinstance(data, str): self.write(data) else: for text in data: diff --git a/warc/tests/test_arc.py b/warc/tests/test_arc.py index ed2521c..65648b6 100644 --- a/warc/tests/test_arc.py +++ b/warc/tests/test_arc.py @@ -1,6 +1,6 @@ import datetime import hashlib -import StringIO +import io from .. import arc @@ -49,7 +49,7 @@ def test_arc_v1_header_creation(): location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") - f = StringIO.StringIO() + f = io.StringIO() header.write_to(f, 1) header_v1_string = f.getvalue() assert header_v1_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500" @@ -67,7 +67,7 @@ def test_arc_v2_header_creation(): location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") - f = StringIO.StringIO() + f = io.StringIO() header.write_to(f) header_v2_string = f.getvalue() assert header_v2_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500" @@ -86,7 +86,7 @@ def test_arc_v1_record_creation(): offset = "300", filename = "sample.arc.gz") record_v1 = arc.ARCRecord(header, "BlahBlah") - f = StringIO.StringIO() + f = io.StringIO() record_v1.write_to(f, 1) record_v1_string = f.getvalue() assert record_v1_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500\nBlahBlah\n" @@ -104,7 +104,7 @@ def test_arc_v2_record_creation(): offset = "300", filename = "sample.arc.gz") record_v2 = arc.ARCRecord(payload = "BlahBlah", headers = header) - f = StringIO.StringIO() + f = io.StringIO() record_v2.write_to(f) record_v2_string = f.getvalue() assert record_v2_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\nBlahBlah\n" @@ -116,7 +116,7 @@ def test_arc_v1_writer(): date = now, org = "Internet Archive") - opfile = StringIO.StringIO() + opfile = io.StringIO() opfile.name = "sample.arc" # Necessary since only file objects in Python have names. f = arc.ARCFile(fileobj = opfile, version = 1, file_headers = file_headers) @@ -137,7 +137,7 @@ def test_arc1_v1_writer_default_headers(): now = datetime.datetime(year = 2012, month = 3, day = 2, hour = 19, minute = 32, second = 10) file_headers = dict(date = now) - opfile = StringIO.StringIO() + opfile = io.StringIO() opfile.name = "sample.arc" # Necessary since only file objects in Python have names. f = arc.ARCFile(fileobj = opfile, version = 1, file_headers = file_headers) @@ -160,7 +160,7 @@ def test_arc_v2_writer(): date = now, org = "Internet Archive") - opfile = StringIO.StringIO() + opfile = io.StringIO() opfile.name = "sample.arc" # Necessary since only file objects in Python have names. f = arc.ARCFile(fileobj = opfile, file_headers = file_headers) @@ -182,8 +182,8 @@ def test_arc_v2_writer(): def test_arc_reader_guess_version(): "Make sure that the ARCFile object automatically detects the file version" - v1 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload1\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload2") - v2 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") + v1 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload1\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload2") + v2 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") arc_v1 = arc.ARCFile(fileobj = v1) arc_v2 = arc.ARCFile(fileobj = v2) @@ -196,7 +196,7 @@ def test_arc_reader_guess_version(): def test_arc_reader_read_file_headers(): "Make sure that the parser is reading file headers properly" - ip = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") + ip = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") arc_file = arc.ARCFile(fileobj = ip) arc_file.read() arc_file.file_headers['ip_address'] == "127.0.0.1" @@ -206,7 +206,7 @@ def test_arc_reader_read_file_headers(): def test_arc_reader_v1(): "Make sure that the parser reads out V1 ARC records. (Also tests iterator behaviour)" - v1 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\nPayload1\nhttp://archive.org 127.0.0.1 20120302193211 text/plain 8\nPayload2") + v1 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\nPayload1\nhttp://archive.org 127.0.0.1 20120302193211 text/plain 8\nPayload2") arc_file = arc.ARCFile(fileobj = v1) r1 = arc_file.read() @@ -229,7 +229,7 @@ def test_arc_reader_v1(): def test_arc_reader_v2(): "Make sure that the parser reads out V2 ARC records. (Also tests iterator behaviour)" - v2 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload2") + v2 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload2") arc_file = arc.ARCFile(fileobj = v2) r1, r2 = list(arc_file) @@ -287,12 +287,12 @@ def test_arc_record_versions(): filename = "sample.arc.gz") record_1 = arc.ARCRecord(payload = "BlahBlah", headers = header, version = 1) record_2 = arc.ARCRecord(payload = "BlahBlah", headers = header, version = 2) - f = StringIO.StringIO() + f = io.StringIO() record_1.write_to(f) record_string = f.getvalue() assert record_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500\nBlahBlah\n" - f = StringIO.StringIO() + f = io.StringIO() record_2.write_to(f) record_string = f.getvalue() assert record_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\nBlahBlah\n" @@ -302,7 +302,7 @@ class TestARCFile: def test_write_headers(self): """Test to make sure header is written just once. """ - f = StringIO.StringIO() + f = io.StringIO() f.name = "sample.arc" afile = arc.ARCFile(fileobj=f, version=1) afile._write_header() @@ -312,14 +312,14 @@ def test_write_headers(self): def test_filename(self): """If filename is specified as argument to ARCFile, it should be used.""" - f = StringIO.StringIO() + f = io.StringIO() afile = arc.ARCFile(fileobj=f, filename="sample.arc", version=1) afile._write_header() assert "sample.arc" in f.getvalue() def test_no_filename(self): """should be able to write ARCFile even if there is no filename.""" - f = StringIO.StringIO() + f = io.StringIO() afile = arc.ARCFile(fileobj=f, version=1) afile._write_header() # filename should be empty diff --git a/warc/tests/test_utils.py b/warc/tests/test_utils.py index c155e6e..f774940 100644 --- a/warc/tests/test_utils.py +++ b/warc/tests/test_utils.py @@ -1,5 +1,5 @@ from ..utils import FilePart, CaseInsensitiveDict -from cStringIO import StringIO +from io import StringIO class TestCaseInsensitiveDict: def test_all(self): diff --git a/warc/tests/test_warc.py b/warc/tests/test_warc.py index 92545ba..6dd4802 100644 --- a/warc/tests/test_warc.py +++ b/warc/tests/test_warc.py @@ -1,6 +1,6 @@ from ..warc import WARCReader, WARCHeader, WARCRecord, WARCFile -from StringIO import StringIO +from io import StringIO class TestWARCHeader: def test_attrs(self): diff --git a/warc/utils.py b/warc/utils.py index 8620e8e..4e72669 100644 --- a/warc/utils.py +++ b/warc/utils.py @@ -7,9 +7,10 @@ :copyright: (c) 2012 Internet Archive """ -from UserDict import DictMixin +from collections import MutableMapping -class CaseInsensitiveDict(DictMixin): + +class CaseInsensitiveDict(MutableMapping): """Almost like a dictionary, but keys are case-insensitive. >>> d = CaseInsensitiveDict(foo=1, Bar=2) @@ -23,9 +24,9 @@ class CaseInsensitiveDict(DictMixin): >>> d.keys() ["foo", "bar"] """ - def __init__(self, mapping=None, **kwargs): + def __init__(self, *args, **kwargs): self._d = {} - self.update(mapping, **kwargs) + self.update(dict(*args, **kwargs)) def __setitem__(self, name, value): self._d[name.lower()] = value @@ -39,8 +40,11 @@ def __delitem__(self, name): def __eq__(self, other): return isinstance(other, CaseInsensitiveDict) and other._d == self._d - def keys(self): - return self._d.keys() + def __iter__(self): + return iter(self._d) + + def __len__(self): + return len(self._d) class FilePart: """File interface over a part of file. @@ -67,7 +71,7 @@ def _read(self, size): else: size = min(size, self.length - self.offset - len(self.buf)) content = self.buf + self.fileobj.read(size) - self.buf = "" + self.buf = b"" self.offset += len(content) return content diff --git a/warc/warc.py b/warc/warc.py index a41562f..d43cc56 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -7,12 +7,12 @@ :copyright: (c) 2012 Internet Archive """ -import __builtin__ +import builtins import datetime import uuid import logging import re -from cStringIO import StringIO +from io import StringIO import hashlib from . import gzip2 @@ -68,7 +68,7 @@ class WARCHeader(CaseInsensitiveDict): def __init__(self, headers, defaults=False): self.version = "WARC/1.0" - CaseInsensitiveDict.__init__(self, headers) + super().__init__(headers) if defaults: self.init_defaults() @@ -91,7 +91,7 @@ def write_to(self, f): """Writes this header to a file, in the format specified by WARC. """ f.write(self.version + "\r\n") - for name, value in self.items(): + for name, value in list(self.items()): name = name.title() # Use standard forms for commonly used patterns name = name.replace("Warc-", "WARC-").replace("-Ip-", "-IP-").replace("-Id", "-ID").replace("-Uri", "-URI") @@ -111,7 +111,7 @@ def content_length(self): @property def type(self): """The value of WARC-Type header.""" - return self.get('WARC-Type') + return self.['WARC-Type'] @property def record_id(self): @@ -244,7 +244,7 @@ def from_response(response): class WARCFile: def __init__(self, filename=None, mode=None, fileobj=None, compress=None): if fileobj is None: - fileobj = __builtin__.open(filename, mode or "rb") + fileobj = builtins.open(filename, mode or "rb") mode = fileobj.mode # initiaize compress based on filename, if not already specified if compress is None and filename and filename.endswith(".gz"): @@ -322,7 +322,7 @@ def __init__(self, fileobj): self.current_payload = None def read_header(self, fileobj): - version_line = fileobj.readline() + version_line = fileobj.readline().decode("utf-8") if not version_line: return None @@ -335,7 +335,7 @@ def read_header(self, fileobj): headers = {} while True: - line = fileobj.readline() + line = fileobj.readline().decode("utf-8") if line == "\r\n": # end of headers break m = self.RE_HEADER.match(line) @@ -346,7 +346,7 @@ def read_header(self, fileobj): return WARCHeader(headers) def expect(self, fileobj, expected_line, message=None): - line = fileobj.readline() + line = fileobj.readline().decode("utf-8") if line != expected_line: message = message or "Expected %r, found %r" % (expected_line, line) raise IOError(message) From bd83608764fb4954776d9c7d23634497e038359c Mon Sep 17 00:00:00 2001 From: recrm Date: Fri, 21 Nov 2014 17:19:57 -0700 Subject: [PATCH 07/28] Update for python3 Numerous small changes. Update for python3, attempt to remove gzip2 and use standard library instead. Began creating small tool for HTTP parsing. Probobly utterly broken. --- warc/arc.py | 10 ++++- warc/gzip2.py | 2 +- warc/utils.py | 105 +++++++++++++++++++++++++++++++++++++++++++------- warc/warc.py | 41 ++++++++------------ 4 files changed, 117 insertions(+), 41 deletions(-) diff --git a/warc/arc.py b/warc/arc.py index 35830f9..c27934d 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -10,6 +10,7 @@ import re import io import warnings +import gzip from .utils import CaseInsensitiveDict @@ -257,7 +258,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ compress = True if compress: - fileobj = gzip2.GzipFile(fileobj=fileobj, mode=mode) + fileobj = gzip.open(fileobj, mode) self.fileobj = fileobj @@ -276,7 +277,12 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ self.header_read = False self.file_meta = '' - + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + def _write_header(self): "Writes out an ARC header" if "org" not in self.file_headers: diff --git a/warc/gzip2.py b/warc/gzip2.py index c4af5f4..bc72c18 100644 --- a/warc/gzip2.py +++ b/warc/gzip2.py @@ -102,7 +102,7 @@ def read_member(self): try: # Read one byte to move to the next member BaseGzipFile._read(self, 1) - assert self._new_member is False +# assert self._new_member is False except EOFError: return None diff --git a/warc/utils.py b/warc/utils.py index 4e72669..b625275 100644 --- a/warc/utils.py +++ b/warc/utils.py @@ -56,22 +56,19 @@ def __init__(self, fileobj, length): self.fileobj = fileobj self.length = length self.offset = 0 - self.buf = "" + self.buf = b'' - def read(self, size=-1): + def read(self, size): if size == -1: - return self._read(self.length) - else: - return self._read(size) + size = self.length - def _read(self, size): if len(self.buf) >= size: content = self.buf[:size] self.buf = self.buf[size:] else: size = min(size, self.length - self.offset - len(self.buf)) content = self.buf + self.fileobj.read(size) - self.buf = b"" + self.buf = b'' self.offset += len(content) return content @@ -79,22 +76,102 @@ def _unread(self, content): self.buf = content + self.buf self.offset -= len(content) - def readline(self): + def readline(self, size=1024): chunks = [] - chunk = self._read(1024) - while chunk and "\n" not in chunk: + chunk = self.read(size) + while chunk and b"\n" not in chunk: chunks.append(chunk) - chunk = self._read(1024) + chunk = self.read(size) - if "\n" in chunk: - index = chunk.index("\n") + if b"\n" in chunk: + index = chunk.index(b"\n") self._unread(chunk[index+1:]) chunk = chunk[:index+1] chunks.append(chunk) - return "".join(chunks) + return b"".join(chunks) def __iter__(self): line = self.readline() while line: yield line line = self.readline() + +class HTTPObject(CaseInsensitiveDict): + """Small object to help with parsing HTTP warc entries""" + def __init__(self, file): + + #Parse version line + id_Str = file.readline().decode("iso-8859-1") + words = id_str.split() + command = path = status = error = version = None + #If length is not 3 it is a bad version line. + if len(words) == 3: + if words[1].isdigit(): + version, error, status = words + else: + command, path, version = words + + self.id = { + "raw": id_Str, + "command": command, + "path": path, + "status": status, + "error": error, + "version": version, + } + + self.header = parse_headers(request_file) + super().__init__(self.header) + + self.payload = request_file + + def _parseversion(self): + + @property + def vline(self): + return self.id["raw"] + + @property + def command(self): + return self.id["command"] + + @property + def path(self): + return self.id["path"] + + @property + def status(self): + return self.id["status"] + + @property + def error(self): + value = self.id["error"] + return int(value) if value else value + + @property + def version(self): + return self.id["version"] + + @property + def content_type(self): + return self.header.get_content_type() + + @property + def charset(self): + return self.header.get_content_charset() + + def read(self, size=1024): + encoding = self.header.get("Transfer-Encoding", "None") + if encoding == "chunked": + found = b'' + length = int(str(self.payload.readline(), "iso-8859-1").rstrip("\r\n"), 16) + while length > 0: + found += self.payload.read(length) + self.payload.readline() + length = int(str(self.payload.readline(), "iso-8859-1").rstrip("\r\n"), 16) + + return found + + else: + length = self.header.get("Content-Length", -1) + return self.payload.read(length) diff --git a/warc/warc.py b/warc/warc.py index d43cc56..5f5f67b 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -7,6 +7,7 @@ :copyright: (c) 2012 Internet Archive """ +import gzip import builtins import datetime import uuid @@ -15,7 +16,6 @@ from io import StringIO import hashlib -from . import gzip2 from .utils import CaseInsensitiveDict, FilePart class WARCHeader(CaseInsensitiveDict): @@ -111,7 +111,7 @@ def content_length(self): @property def type(self): """The value of WARC-Type header.""" - return self.['WARC-Type'] + return self['WARC-Type'] @property def record_id(self): @@ -251,11 +251,20 @@ def __init__(self, filename=None, mode=None, fileobj=None, compress=None): compress = True if compress: - fileobj = gzip2.GzipFile(fileobj=fileobj, mode=mode) + fileobj = gzip.open(fileobj, mode) self.fileobj = fileobj self._reader = None - + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + def __iter__(self): + return iter(self.reader) + @property def reader(self): if self._reader is None: @@ -266,17 +275,11 @@ def write_record(self, warc_record): """Adds a warc record to this WARC file. """ warc_record.write_to(self.fileobj) - # Each warc record is written as separate member in the gzip file - # so that each record can be read independetly. - if isinstance(self.fileobj, gzip2.GzipFile): - self.fileobj.close_member() def read_record(self): """Reads a warc record from this WARC file.""" return self.reader.read_record() - - def __iter__(self): - return iter(self.reader) + def close(self): self.fileobj.close() @@ -304,13 +307,9 @@ def browse(self): offset = next_offset def tell(self): - """Returns the file offset. If this is a compressed file, then the - offset in the compressed file is returned. + """Returns the file offset. """ - if isinstance(self.fileobj, gzip2.GzipFile): - return self.fileobj.fileobj.tell() - else: - return self.fileobj.tell() + return self.fileobj.tell() class WARCReader: RE_VERSION = re.compile("WARC/(\d+.\d+)\r\n") @@ -362,13 +361,7 @@ def finish_reading_current_record(self): def read_record(self): self.finish_reading_current_record() - - if isinstance(self.fileobj, gzip2.GzipFile): - fileobj = self.fileobj.read_member() - if fileobj is None: - return None - else: - fileobj = self.fileobj + fileobj = self.fileobj header = self.read_header(fileobj) if header is None: From 9bd3ff85095b881082793fd0149e22e94214386c Mon Sep 17 00:00:00 2001 From: recrm Date: Fri, 21 Nov 2014 17:28:20 -0700 Subject: [PATCH 08/28] Quick update of readme --- Readme.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Readme.rst b/Readme.rst index b918dc5..51e9f6b 100644 --- a/Readme.rst +++ b/Readme.rst @@ -1,5 +1,7 @@ -warc: Python library to work with WARC files -============================================ +warc3: Python3 library to work with WARC files +============================================= + +(Note: This is a fork of the original (now dead) warc repository. This project is still in transition and probobly not stable.) .. image:: https://secure.travis-ci.org/anandology/warc.png?branch=master :alt: build status @@ -12,9 +14,9 @@ http://bibnum.bnf.fr/WARC/ This `warc` library makes it very easy to work with WARC files.:: import warc - f = warc.open("test.warc") - for record in f: - print record['WARC-Target-URI'], record['Content-Length'] + with warc.open("test.warc") as f: + for record in f: + print record['WARC-Target-URI'], record['Content-Length'] Documentation ------------- From dae393862ba35f0df1e80e4e543a566e74834e89 Mon Sep 17 00:00:00 2001 From: recrm Date: Wed, 3 Dec 2014 13:08:23 -0700 Subject: [PATCH 09/28] Updated testing, improved HTTPObject. Numerous bug fixes in warc --- warc/__init__.py | 1 + warc/arc.py | 2 +- warc/tests/test_arc.py | 10 ++- warc/tests/test_common.py | 35 +---------- warc/tests/test_utils.py | 54 ++++++++++------- warc/tests/test_warc.py | 46 ++++++-------- warc/utils.py | 124 +++++++++++++++++++++++++------------- warc/warc.py | 60 +++++++++++------- 8 files changed, 177 insertions(+), 155 deletions(-) diff --git a/warc/__init__.py b/warc/__init__.py index 71392bc..098af4c 100644 --- a/warc/__init__.py +++ b/warc/__init__.py @@ -9,6 +9,7 @@ from .arc import ARCFile, ARCRecord, ARCHeader from .warc import WARCFile, WARCRecord, WARCHeader, WARCReader +from .utils import HTTPObject def detect_format(filename): """Tries to figure out the type of the file. Return 'warc' for diff --git a/warc/arc.py b/warc/arc.py index c27934d..30c20c9 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -207,7 +207,7 @@ def __str__(self): class ARCFile(object): - def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_headers = {}): + def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_headers = {}, compress=False): """ Initialises a file like object that can be used to read or write Arc files. Works for both version 1 or version 2. diff --git a/warc/tests/test_arc.py b/warc/tests/test_arc.py index 65648b6..6b6f16e 100644 --- a/warc/tests/test_arc.py +++ b/warc/tests/test_arc.py @@ -4,12 +4,10 @@ from .. import arc -import pytest - -def test_init_arc_header(): - "Make sure Header can be initialise only with expected fields" - with pytest.raises(TypeError): - arc.ARCHeader(test="1234") +#def test_init_arc_header(): +# "Make sure Header can be initialise only with expected fields" +# with pytest.raises(TypeError): +# arc.ARCHeader(test="1234") def test_arc_header_attributes(): "Make sure that ARC1 header fields are accessible as attributes. Double check for attributes that are converted for convenience (e.g. date and length)" diff --git a/warc/tests/test_common.py b/warc/tests/test_common.py index d2c2353..7981047 100644 --- a/warc/tests/test_common.py +++ b/warc/tests/test_common.py @@ -1,10 +1,8 @@ -from .. import open as libopen -from .. import WARCFile, ARCFile +from ..__init__ import open as libopen +from ..warc import WARCFile import os -import pytest - def test_open_warc_file(): "Test opening a WARC file" @@ -13,32 +11,3 @@ def test_open_warc_file(): f.close() os.unlink("foo.warc") - -def test_open_arc_file(): - "Test opening an ARC file" - - f = libopen("foo.arc","wb") - assert isinstance(f, ARCFile) - f.close() - os.unlink("foo.arc") - - -def test_open_unknown_file(): - "Test opening a WARC file" - - with pytest.raises(IOError): - libopen("foo","wb") - - -def test_sample_data(): - import gzip - f = gzip.GzipFile("test_data/alexa_short_header.arc.gz") - a = ARCFile(fileobj = f) - record = str(a.read()) - expected = """http://www.killerjo.net:80/robots.txt 211.111.217.29 20110804181142 39 -SSH-2.0-OpenSSH_5.3p1 Debian-3ubuntu3\r\n\n""" - assert record == expected - - - - diff --git a/warc/tests/test_utils.py b/warc/tests/test_utils.py index f774940..bd7862a 100644 --- a/warc/tests/test_utils.py +++ b/warc/tests/test_utils.py @@ -1,5 +1,5 @@ from ..utils import FilePart, CaseInsensitiveDict -from io import StringIO +import io class TestCaseInsensitiveDict: def test_all(self): @@ -18,39 +18,47 @@ def test_all(self): assert sorted(d.items()) == [("bar", 2), ("foo", 1)] class TestFilePart: - def setup_method(self, m): + def setup(self): # 5 chars in each line - self.text = "\n".join(["aaaa", "bbbb", "cccc", "dddd", "eeee", "ffff"]) + self.text = b"\n".join([b"aaaa", b"bbbb", b"cccc", b"dddd", b"eeee", b"ffff"]) def test_read(self): - part = FilePart(StringIO(self.text), 0) - assert part.read() == "" + part = FilePart(io.BytesIO(self.text), 0) + assert part.read() == b"" - part = FilePart(StringIO(self.text), 5) - assert part.read() == "aaaa\n" + part = FilePart(io.BytesIO(self.text), 5) + assert part.read() == b"aaaa\n" - part = FilePart(StringIO(self.text), 10) - assert part.read() == "aaaa\nbbbb\n" + part = FilePart(io.BytesIO(self.text), 10) + assert part.read() == b"aaaa\nbbbb\n" # try with large data - part = FilePart(StringIO("a" * 10000), 10) + part = FilePart(io.BytesIO(b"a" * 10000), 10) assert len(part.read()) == 10 def test_read_with_size(self): - part = FilePart(StringIO(self.text), 10) - assert part.read(3) == "aaa" - assert part.read(3) == "a\nb" - assert part.read(3) == "bbb" - assert part.read(3) == "\n" - assert part.read(3) == "" + part = FilePart(io.BytesIO(self.text), 10) + assert part.read(3) == b"aaa" + assert part.read(3) == b"a\nb" + assert part.read(3) == b"bbb" + assert part.read(3) == b"\n" + assert part.read(3) == b"" + + def test_read_with_buffer(self): + "Tests read size when read length is larger than buffer." + fb = io.BytesIO(b'a' * 10000) + part = FilePart(fb, 10000) + temp = part.read(100) + part._unread(temp) + assert len(part.read(1000)) == 1000 def test_readline(self): - part = FilePart(StringIO(self.text), 11) - assert part.readline() == "aaaa\n" - assert part.readline() == "bbbb\n" - assert part.readline() == "c" - assert part.readline() == "" + part = FilePart(io.BytesIO(self.text), 11) + assert part.readline() == b"aaaa\n" + assert part.readline() == b"bbbb\n" + assert part.readline() == b"c" + assert part.readline() == b"" def test_iter(self): - part = FilePart(StringIO(self.text), 11) - assert list(part) == ["aaaa\n", "bbbb\n", "c"] \ No newline at end of file + part = FilePart(io.BytesIO(self.text), 11) + assert list(part) == [b"aaaa\n", b"bbbb\n", b"c"] diff --git a/warc/tests/test_warc.py b/warc/tests/test_warc.py index 6dd4802..7f722f7 100644 --- a/warc/tests/test_warc.py +++ b/warc/tests/test_warc.py @@ -1,6 +1,5 @@ from ..warc import WARCReader, WARCHeader, WARCRecord, WARCFile - -from io import StringIO +import io class TestWARCHeader: def test_attrs(self): @@ -53,21 +52,21 @@ def f(type): assert f("newtype")["Content-Type"] == "application/octet-stream" SAMPLE_WARC_RECORD_TEXT = ( - "WARC/1.0\r\n" + - "Content-Length: 10\r\n" + - "WARC-Date: 2012-02-10T16:15:52Z\r\n" + - "Content-Type: application/http; msgtype=response\r\n" + - "WARC-Type: response\r\n" + - "WARC-Record-ID: \r\n" + - "WARC-Target-URI: http://example.com/\r\n" + - "\r\n" + - "Helloworld" + - "\r\n\r\n" + b"WARC/1.0\r\n" + + b"Content-Length: 10\r\n" + + b"WARC-Date: 2012-02-10T16:15:52Z\r\n" + + b"Content-Type: application/http; msgtype=response\r\n" + + b"WARC-Type: response\r\n" + + b"WARC-Record-ID: \r\n" + + b"WARC-Target-URI: http://example.com/\r\n" + + b"\r\n" + + b"Helloworld" + + b"\r\n\r\n" ) class TestWARCReader: def test_read_header1(self): - f = StringIO(SAMPLE_WARC_RECORD_TEXT) + f = io.BytesIO(SAMPLE_WARC_RECORD_TEXT) h = WARCReader(f).read_record().header assert h.date == "2012-02-10T16:15:52Z" assert h.record_id == "" @@ -75,17 +74,17 @@ def test_read_header1(self): assert h.content_length == 10 def test_empty(self): - reader = WARCReader(StringIO("")) + reader = WARCReader(io.BytesIO(b"")) assert reader.read_record() is None def test_read_record(self): - f = StringIO(SAMPLE_WARC_RECORD_TEXT) + f = io.BytesIO(SAMPLE_WARC_RECORD_TEXT) reader = WARCReader(f) record = reader.read_record() - assert "".join(record.payload) == "Helloworld" + assert record.payload.readline() == b"Helloworld" def read_multiple_records(self): - f = StringIO(SAMPLE_WARC_RECORD_TEXT * 5) + f = io.BytesIO(SAMPLE_WARC_RECORD_TEXT * 5) reader = WARCReader(f) for i in range(5): rec = reader.read_record() @@ -93,21 +92,10 @@ def read_multiple_records(self): class TestWarcFile: def test_read(self): - f = WARCFile(fileobj=StringIO(SAMPLE_WARC_RECORD_TEXT)) + f = WARCFile(fileobj=io.BytesIO(SAMPLE_WARC_RECORD_TEXT)) assert f.read_record() is not None assert f.read_record() is None - def test_write_gz(self): - """Test writing multiple member gzip file.""" - buffer = StringIO() - f = WARCFile(fileobj=buffer, mode="w", compress=True) - for i in range(10): - record = WARCRecord(payload="hello %d" % i) - f.write_record(record) - - GZIP_MAGIC_NUMBER = '\037\213' - assert buffer.getvalue().count(GZIP_MAGIC_NUMBER) == 10 - def test_long_header(self): """Test large WARC header with a CRLF across a 1024 byte boundrary""" from .. import warc diff --git a/warc/utils.py b/warc/utils.py index b625275..b8fe720 100644 --- a/warc/utils.py +++ b/warc/utils.py @@ -8,7 +8,7 @@ """ from collections import MutableMapping - +from http.client import parse_headers class CaseInsensitiveDict(MutableMapping): """Almost like a dictionary, but keys are case-insensitive. @@ -57,32 +57,32 @@ def __init__(self, fileobj, length): self.length = length self.offset = 0 self.buf = b'' - - def read(self, size): + + def read(self, size=-1): if size == -1: size = self.length - + if len(self.buf) >= size: content = self.buf[:size] self.buf = self.buf[size:] else: - size = min(size, self.length - self.offset - len(self.buf)) - content = self.buf + self.fileobj.read(size) + size = min(size, self.length - self.offset) + content = self.buf + self.fileobj.read(size - len(self.buf)) self.buf = b'' self.offset += len(content) return content - + def _unread(self, content): self.buf = content + self.buf self.offset -= len(content) - + def readline(self, size=1024): chunks = [] chunk = self.read(size) while chunk and b"\n" not in chunk: chunks.append(chunk) chunk = self.read(size) - + if b"\n" in chunk: index = chunk.index(b"\n") self._unread(chunk[index+1:]) @@ -98,21 +98,28 @@ def __iter__(self): class HTTPObject(CaseInsensitiveDict): """Small object to help with parsing HTTP warc entries""" - def __init__(self, file): - + def __init__(self, request_file): #Parse version line - id_Str = file.readline().decode("iso-8859-1") + id_str_raw = request_file.readline() + id_str = self.id_str_raw.decode("iso-8859-1") + if "HTTP" not in id_str: + #This is not an HTTP object. + request_file._unread(self.id_str_raw) + raise ValueError("Object is not HTTP.") + words = id_str.split() command = path = status = error = version = None #If length is not 3 it is a bad version line. - if len(words) == 3: + if len(words) >= 3: if words[1].isdigit(): - version, error, status = words + version = words[0] + error = words[1] + status = " ".join(words[2:]) else: command, path, version = words - self.id = { - "raw": id_Str, + self._id = { + "vline": id_str, "command": command, "path": path, "status": status, @@ -120,48 +127,82 @@ def __init__(self, file): "version": version, } - self.header = parse_headers(request_file) - super().__init__(self.header) - + self._header = parse_headers(request_file) + super().__init__(self._header) self.payload = request_file - def _parseversion(self): - + def __repr__(self): + return(self.vline + str(self._header)) + + def __getitem__(self, name): + try: + return super().__getitem__(name) + except KeyError: + value = name.lower() + if value == "content_type": + return self.content_type + elif value == "charset": + return self.charset + elif value == "host": + return self.host + elif value in self._id: + return self._id[value] + else: + raise + + def _reset(self): + self.payload._unread("\r\n".encode()) + for i in self._header: + value = i + ": " + self._header[i] + "\r\n" + self.payload._unread(value.encode()) + self.payload._unread(self.vline.encode()) + @property def vline(self): - return self.id["raw"] - + return self._id["vline"] + + @property + def version(self): + return self._id["version"] + + #Request @property def command(self): - return self.id["command"] + return self._id["command"] @property def path(self): - return self.id["path"] + return self._id["path"] + + @property + def host(self): + try: + return self._d['host'] + except: + return None + #Response @property def status(self): - return self.id["status"] + return self._id["status"] @property def error(self): - value = self.id["error"] + value = self._id["error"] return int(value) if value else value - @property - def version(self): - return self.id["version"] - + #Inherited from email parser. @property def content_type(self): - return self.header.get_content_type() - + return self._header.get_content_type() + @property def charset(self): - return self.header.get_content_charset() - - def read(self, size=1024): - encoding = self.header.get("Transfer-Encoding", "None") + return self._header.get_content_charset() + + #Havn't used it yet. + def get_payload(self, size=1024): + encoding = self._header.get("Transfer-Encoding", "None") if encoding == "chunked": found = b'' length = int(str(self.payload.readline(), "iso-8859-1").rstrip("\r\n"), 16) @@ -169,9 +210,8 @@ def read(self, size=1024): found += self.payload.read(length) self.payload.readline() length = int(str(self.payload.readline(), "iso-8859-1").rstrip("\r\n"), 16) - + return found - - else: - length = self.header.get("Content-Length", -1) - return self.payload.read(length) + + length = int(self._header.get("Content-Length", -1)) + return self.payload.read(length) diff --git a/warc/warc.py b/warc/warc.py index 5f5f67b..69b26e1 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -13,10 +13,11 @@ import uuid import logging import re -from io import StringIO +import io import hashlib +import sys -from .utils import CaseInsensitiveDict, FilePart +from .utils import CaseInsensitiveDict, FilePart, HTTPObject class WARCHeader(CaseInsensitiveDict): """The WARC Header object represents the headers of a WARC record. @@ -90,18 +91,18 @@ def init_defaults(self): def write_to(self, f): """Writes this header to a file, in the format specified by WARC. """ - f.write(self.version + "\r\n") - for name, value in list(self.items()): + f.write(self.version.encode() + b"\r\n") + for name, value in self.items(): name = name.title() # Use standard forms for commonly used patterns name = name.replace("Warc-", "WARC-").replace("-Ip-", "-IP-").replace("-Id", "-ID").replace("-Uri", "-URI") - f.write(name) - f.write(": ") - f.write(value) - f.write("\r\n") - + f.write(str(name).encode()) + f.write(b": ") + f.write(str(value).encode()) + f.write(b"\r\n") + # Header ends with an extra CRLF - f.write("\r\n") + f.write(b"\r\n") @property def content_length(self): @@ -124,9 +125,9 @@ def date(self): return self['WARC-Date'] def __str__(self): - f = StringIO() + f = io.BytesIO() self.write_to(f) - return f.getvalue() + return str(f.getvalue(), 'utf-8') def __repr__(self): return "" % (self.type, self.record_id) @@ -142,26 +143,43 @@ def __init__(self, header=None, payload=None, headers={}, defaults=True): headers.setdefault("WARC-Type", "response") self.header = header or WARCHeader(headers, defaults=True) - self.payload = payload if defaults is True and 'Content-Length' not in self.header: if payload: - self.header['Content-Length'] = str(len(payload)) + self.header['Content-Length'] = len(payload) else: self.header['Content-Length'] = "0" if defaults is True and 'WARC-Payload-Digest' not in self.header: self.header['WARC-Payload-Digest'] = self._compute_digest(payload) + if isinstance(payload, str): + payload = payload.encode() + if isinstance(payload, bytes): + payload = io.BytesIO(payload) + + self.payload = payload + self.http = False + def _compute_digest(self, payload): return "sha1:" + hashlib.sha1(payload).hexdigest() def write_to(self, f): self.header.write_to(f) - f.write(self.payload) - f.write("\r\n") - f.write("\r\n") + if self.http: + self.http._reset() + f.write(self.payload.read()) + f.write(b"\r\n") + f.write(b"\r\n") f.flush() + + def get_HTTP(self): + wtype = self['warc-type'] + if wtype in {"response", "request"}: + http = HTTPObject(self.payload) + self.http = http + return http + return False @property def type(self): @@ -205,9 +223,9 @@ def __contains__(self, name): return name in self.header def __str__(self): - f = StringIO() + f = io.BytesIO() self.write_to(f) - return f.getvalue() + return str(f.getvalue()) def __repr__(self): return "" % (self.type, self['WARC-Record-ID']) @@ -230,7 +248,7 @@ def from_response(response): body = http_response.read() # Monkey-patch the response object so that it is possible to read from it later. - response.raw._fp = StringIO(body) + response.raw._fp = io.BytesIO(body) # Build the payload to create warc file. payload = status_line + "\r\n" + headers + "\r\n" + body @@ -300,7 +318,7 @@ def browse(self): # This will make sure memory consuption is under control and it # is possible to look at the first MB of the payload, which is # typically sufficient to read http headers in the payload. - record.payload = StringIO(record.payload.read(1024*1024)) + record.payload = io.BytesIO(record.payload.read(1024*1024)) self.reader.finish_reading_current_record() next_offset = self.tell() yield record, offset, next_offset-offset From c4895d557abc64dc86e505694a0407a86c25aef3 Mon Sep 17 00:00:00 2001 From: Ryan Chartier Date: Fri, 12 Dec 2014 12:35:21 -0700 Subject: [PATCH 10/28] update to HTTPObject --- warc/utils.py | 69 ++++++++++++++++++++++++++++++++++----------------- warc/warc.py | 20 ++++++++------- 2 files changed, 57 insertions(+), 32 deletions(-) diff --git a/warc/utils.py b/warc/utils.py index b8fe720..9dac1e5 100644 --- a/warc/utils.py +++ b/warc/utils.py @@ -8,7 +8,8 @@ """ from collections import MutableMapping -from http.client import parse_headers +from http.client import HTTPMessage +import email.parser class CaseInsensitiveDict(MutableMapping): """Almost like a dictionary, but keys are case-insensitive. @@ -101,10 +102,10 @@ class HTTPObject(CaseInsensitiveDict): def __init__(self, request_file): #Parse version line id_str_raw = request_file.readline() - id_str = self.id_str_raw.decode("iso-8859-1") + id_str = id_str_raw.decode("iso-8859-1") if "HTTP" not in id_str: #This is not an HTTP object. - request_file._unread(self.id_str_raw) + request_file._unread(id_str_raw) raise ValueError("Object is not HTTP.") words = id_str.split() @@ -119,7 +120,7 @@ def __init__(self, request_file): command, path, version = words self._id = { - "vline": id_str, + "vline": id_str_raw, "command": command, "path": path, "status": status, @@ -127,10 +128,22 @@ def __init__(self, request_file): "version": version, } - self._header = parse_headers(request_file) + self._header, self.hstring = self._parse_headers(request_file) super().__init__(self._header) self.payload = request_file + @staticmethod + def _parse_headers(fp): + """This is a modification of the python3 http.clint.parse_headers function.""" + headers = [] + while True: + line = fp.readline(65536) + headers.append(line) + if line in (b'\r\n', b'\n', b''): + break + hstring = b''.join(headers) + return email.parser.Parser(_class=HTTPMessage).parsestr(hstring.decode('iso-8859-1')), hstring + def __repr__(self): return(self.vline + str(self._header)) @@ -151,15 +164,19 @@ def __getitem__(self, name): raise def _reset(self): - self.payload._unread("\r\n".encode()) - for i in self._header: - value = i + ": " + self._header[i] + "\r\n" - self.payload._unread(value.encode()) - self.payload._unread(self.vline.encode()) - + self.payload._unread(self.hstring) + self.payload._unread(self._id['vline']) + + def write_to(self, f): + f.write(self._id['vline']) + f.write(self.hstring) + f.write(self.payload.read()) + f.write(b"\r\n\r\n") + f.flush() + @property def vline(self): - return self._id["vline"] + return self._id["vline"].decode("iso-8859-1") @property def version(self): @@ -188,8 +205,7 @@ def status(self): @property def error(self): - value = self._id["error"] - return int(value) if value else value + return self._id["error"] #Inherited from email parser. @property @@ -200,18 +216,25 @@ def content_type(self): def charset(self): return self._header.get_content_charset() - #Havn't used it yet. - def get_payload(self, size=1024): + def write_payload_to(self, fp): encoding = self._header.get("Transfer-Encoding", "None") if encoding == "chunked": found = b'' - length = int(str(self.payload.readline(), "iso-8859-1").rstrip("\r\n"), 16) + length = int(str(self.payload.readline(), "iso-8859-1").rstrip(), 16) while length > 0: found += self.payload.read(length) self.payload.readline() - length = int(str(self.payload.readline(), "iso-8859-1").rstrip("\r\n"), 16) - - return found - - length = int(self._header.get("Content-Length", -1)) - return self.payload.read(length) + length = int(str(self.payload.readline(), "iso-8859-1").rstrip(), 16) + else: + length = int(self._header.get("Content-Length", -1)) + found = self.payload.read(length) + + fp.write(found) + + + + + + + + diff --git a/warc/warc.py b/warc/warc.py index 69b26e1..dbe5e18 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -159,7 +159,7 @@ def __init__(self, header=None, payload=None, headers={}, defaults=True): payload = io.BytesIO(payload) self.payload = payload - self.http = False + self._http = None def _compute_digest(self, payload): return "sha1:" + hashlib.sha1(payload).hexdigest() @@ -173,14 +173,16 @@ def write_to(self, f): f.write(b"\r\n") f.flush() - def get_HTTP(self): - wtype = self['warc-type'] - if wtype in {"response", "request"}: - http = HTTPObject(self.payload) - self.http = http - return http - return False - + @property + def http(self): + if self._http is None: +# print(self.header['content-type']) + if 'application/http' in self.header['content-type']: + self._http = HTTPObject(self.payload) + else: + self._http = False + return self._http + @property def type(self): """Record type""" From ed170b77ad84c63c4c8f087d2ff345ec387b1f67 Mon Sep 17 00:00:00 2001 From: recrm Date: Mon, 15 Dec 2014 15:24:33 -0700 Subject: [PATCH 11/28] Added warcscrape.py and supporting files. Added warcscrape.py and supporting files. --- requirements.txt | 2 +- warc/__init__.py | 4 +- warc/arc.py | 98 +++++++++--------- warc/gzip2.py | 121 ---------------------- warc/tests/test_arc.py | 102 +++++++++---------- warc/tests/test_common.py | 2 +- warc/tests/test_utils.py | 20 ++-- warc/utils.py | 126 +++++++++++------------ warc/warc.py | 165 ++++++++++++++++-------------- warcscrape.py | 205 ++++++++++++++++++++++++++++++++++++++ 10 files changed, 467 insertions(+), 378 deletions(-) delete mode 100644 warc/gzip2.py create mode 100644 warcscrape.py diff --git a/requirements.txt b/requirements.txt index e079f8a..f3c7e8e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -pytest +nose diff --git a/warc/__init__.py b/warc/__init__.py index 098af4c..68634d1 100644 --- a/warc/__init__.py +++ b/warc/__init__.py @@ -15,9 +15,7 @@ def detect_format(filename): """Tries to figure out the type of the file. Return 'warc' for WARC files and 'arc' for ARC files""" - if ".arc" in filename: - return "arc" - if ".warc" in filename: + if filename.endswith(".warc") or filename.endswith(".warc.gz"): return "warc" return "unknown" diff --git a/warc/arc.py b/warc/arc.py index 30c20c9..c1438be 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -1,5 +1,5 @@ """ -Provides support for ARC v1 files. +Provides support for ARC v1 files. :copyright: (c) 2012 Internet Archive """ @@ -28,21 +28,21 @@ class ARCHeader(CaseInsensitiveDict): * content_type * length (length of the n/w doc in bytes) - V2 header fields are + V2 header fields are * url * ip_address * date (date of archival) - * content_type + * content_type * result_code (response code) - * checksum + * checksum * location * offset (offset from beginning of file to recrod) * filename (name of arc file) * length (length of the n/w doc in bytes) """ - def __init__(self, url = "", ip_address = "", date = "", content_type = "", + def __init__(self, url = "", ip_address = "", date = "", content_type = "", result_code = "", checksum = "", location = "", offset = "", filename = "", length = "", version = 2): if isinstance(date, datetime.datetime): @@ -55,8 +55,8 @@ def __init__(self, url = "", ip_address = "", date = "", content_type = "", self.version = version - CaseInsensitiveDict.__init__(self, - url = url, + CaseInsensitiveDict.__init__(self, + url = url, ip_address = ip_address, date = date, content_type = content_type, @@ -66,10 +66,10 @@ def __init__(self, url = "", ip_address = "", date = "", content_type = "", offset = offset, filename = filename, length = length) - + def write_to(self, f, version = None): """ - Writes out the arc header to the file like object `f`. + Writes out the arc header to the file like object `f`. If the version field is 1, it writes out an arc v1 header, otherwise (and this is default), it outputs a v2 header. @@ -93,44 +93,44 @@ def write_to(self, f, version = None): filename = self['filename'], length = self['length']) f.write(header) - + @property def url(self): return self["url"] - + @property def ip_address(self): return self["ip_address"] - + @property def date(self): return datetime.datetime.strptime(self['date'], "%Y%m%d%H%M%S") - + @property def content_type(self): return self["content_type"] - + @property def result_code(self): return self["result_code"] - + @property def checksum (self): return self["checksum"] - + @property def location(self): return self["location"] - + @property def offset(self): return int(self["offset"]) - + @property def filename(self): return self["filename"] - + @property def length(self): return int(self["length"]) @@ -139,7 +139,7 @@ def __str__(self): f = io.StringIO() self.write_to(f) return f.getvalue() - + def __repr__(self): f = {} for i in "url ip_address date content_typeresult_code checksum location offset filename length".split(): @@ -149,7 +149,7 @@ def __repr__(self): s = ", ".join(s) return ""%s - + class ARCRecord(object): def __init__(self, header = None, payload = None, headers = {}, version = None): if not (header or headers): @@ -157,11 +157,11 @@ def __init__(self, header = None, payload = None, headers = {}, version = None): self.header = header or ARCHeader(version = version, **headers) self.payload = payload self.version = version - + @classmethod def from_string(cls, string, version): """ - Constructs an ARC record from a string and returns it. + Constructs an ARC record from a string and returns it. TODO: It might be best to merge this with the _read_arc_record function rather than reimplement the functionality here. @@ -199,20 +199,20 @@ def __getitem__(self, name): def __setitem__(self, name, value): self.header[name] = value - + def __str__(self): f = io.StringIO() self.write_to(f) return f.getvalue() - - + + class ARCFile(object): def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_headers = {}, compress=False): """ Initialises a file like object that can be used to read or write Arc files. Works for both version 1 or version 2. - This can be called similar to the builtin `file` constructor. + This can be called similar to the builtin `file` constructor. It can also just be given a fileobj which is a file like object that it will use directly for its work. @@ -224,7 +224,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ * ip_address - IP address of the machine doing the Archiving * date - Date of archival - * org - Organisation that's doing the Archiving. + * org - Organisation that's doing the Archiving. The version parameter tries to work intuitively as follows @@ -248,7 +248,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ * When we try to read a record, it will read out one record and try to guess the version from it (for the first read). - + """ if fileobj is None: fileobj = builtins.open(filename, mode or "rb") @@ -259,7 +259,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ if compress: fileobj = gzip.open(fileobj, mode) - + self.fileobj = fileobj self.filename = filename @@ -282,7 +282,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self.close() - + def _write_header(self): "Writes out an ARC header" if "org" not in self.file_headers: @@ -301,15 +301,15 @@ def _write_header(self): payload = "2 0 %(org)s\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length" else: raise IOError("Can't write an ARC file with version '\"%s\"'"%self.version) - + fname = os.path.basename(self.filename) header = ARCHeader(url = "filedesc://%s"%fname, - ip_address = self.file_headers['ip_address'], + ip_address = self.file_headers['ip_address'], date = self.file_headers['date'], - content_type = "text/plain", + content_type = "text/plain", length = len(payload), result_code = "200", - checksum = "-", + checksum = "-", location = "-", offset = str(self.fileobj.tell()), filename = fname) @@ -340,7 +340,7 @@ def _read_file_header(self): # print "--------------------------------------------------" if self.version and int(self.version) != version: raise IOError("Version mismatch. Requested version was '%s' but version in file was '%s'"%(self.version, version)) - + if version == '1': url, ip_address, date, content_type, length = header.split() self.file_headers = {"ip_address" : ip_address, @@ -404,13 +404,13 @@ def _read_arc_record(self): self.fileobj.readline() # Munge the separator newline. return ARCRecord(header = arc_header, payload = payload) - + def read(self): "Reads out an arc record from the file" if not self.header_read: self._read_file_header() return self._read_arc_record() - + # For compatability with WARCFile read_record = read write_record = write @@ -420,16 +420,16 @@ def __iter__(self): while record: yield record record = self.read() - + def close(self): self.fileobj.close() - - - - - - - - - - + + + + + + + + + + diff --git a/warc/gzip2.py b/warc/gzip2.py deleted file mode 100644 index bc72c18..0000000 --- a/warc/gzip2.py +++ /dev/null @@ -1,121 +0,0 @@ -"""Enhanced gzip library to support multiple member gzip files. - -GZIP has an interesting property that contatination of mutliple gzip files is a valid gzip file. -In other words, a gzip file can have multiple members, each individually gzip -compressed. The members simply appear one after another in the file, with no -additional information before, between, or after them. - -See GZIP RFC for more information. - -http://www.gzip.org/zlib/rfc-gzip.html - -This library provides support for creating and reading multi-member gzip files. -""" -from gzip import WRITE, READ, write32u, GzipFile as BaseGzipFile -import zlib - -def open(filename, mode="rb", compresslevel=9): - """Shorthand for GzipFile(filename, mode, compresslevel). - """ - return GzipFile(filename, mode, compresslevel) - -class GzipFile(BaseGzipFile): - """GzipFile with support for multi-member gzip files. - """ - def __init__(self, filename=None, mode=None, - compresslevel=9, fileobj=None): - BaseGzipFile.__init__(self, - filename=filename, - mode=mode, - compresslevel=compresslevel, - fileobj=fileobj) - - if self.mode == WRITE: - # Indicates the start of a new member if value is True. - # The BaseGzipFile constructor already wrote the header for new - # member, so marking as False. - self._new_member = False - - # When _member_lock is True, only one member in gzip file is read - self._member_lock = False - - def close_member(self): - """Closes the current member being written. - """ - # The new member is not yet started, no need to close - if self._new_member: - return - - self.fileobj.write(self.compress.flush()) - write32u(self.fileobj, self.crc) - # self.size may exceed 2GB, or even 4GB - write32u(self.fileobj, self.size & 0xffffffff) - self.size = 0 - self.compress = zlib.compressobj(9, - zlib.DEFLATED, - -zlib.MAX_WBITS, - zlib.DEF_MEM_LEVEL, - 0) - self._new_member = True - - def _start_member(self): - """Starts writing a new member if required. - """ - if self._new_member: - self._init_write(self.name) - self._write_gzip_header() - self._new_member = False - - def write(self, data): - self._start_member() - BaseGzipFile.write(self, data) - - def close(self): - """Closes the gzip with care to handle multiple members. - """ - if self.fileobj is None: - return - if self.mode == WRITE: - self.close_member() - self.fileobj = None - elif self.mode == READ: - self.fileobj = None - - if self.myfileobj: - self.myfileobj.close() - self.myfileobj = None - - def _read(self, size): - # Treat end of member as end of file when _member_lock flag is set - if self._member_lock and self._new_member: - raise EOFError() - else: - return BaseGzipFile._read(self, size) - - def read_member(self): - """Returns a file-like object to read one member from the gzip file. - """ - if self._member_lock is False: - self._member_lock = True - - if self._new_member: - try: - # Read one byte to move to the next member - BaseGzipFile._read(self, 1) -# assert self._new_member is False - except EOFError: - return None - - return self - - def write_member(self, data): - """Writes the given data as one gzip member. - - The data can be a string, an iterator that gives strings or a file-like object. - """ - if isinstance(data, str): - self.write(data) - else: - for text in data: - self.write(text) - self.close_member() diff --git a/warc/tests/test_arc.py b/warc/tests/test_arc.py index 6b6f16e..d0f0b9c 100644 --- a/warc/tests/test_arc.py +++ b/warc/tests/test_arc.py @@ -8,20 +8,20 @@ # "Make sure Header can be initialise only with expected fields" # with pytest.raises(TypeError): # arc.ARCHeader(test="1234") - + def test_arc_header_attributes(): "Make sure that ARC1 header fields are accessible as attributes. Double check for attributes that are converted for convenience (e.g. date and length)" header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") - + assert header.url == "http://archive.org" assert header.ip_address == "127.0.0.1" assert header.date == datetime.datetime.strptime("20120301093000", "%Y%m%d%H%M%S") @@ -34,16 +34,16 @@ def test_arc_header_attributes(): assert header.location == "http://www.archive.org" assert header.offset == 300 assert header.filename == "sample.arc.gz" - + def test_arc_v1_header_creation(): "Validate ARC V1 header creation" header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") @@ -51,17 +51,17 @@ def test_arc_v1_header_creation(): header.write_to(f, 1) header_v1_string = f.getvalue() assert header_v1_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500" - - + + def test_arc_v2_header_creation(): "Validate ARC V2 header creation" header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") @@ -69,17 +69,17 @@ def test_arc_v2_header_creation(): header.write_to(f) header_v2_string = f.getvalue() assert header_v2_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500" - - + + def test_arc_v1_record_creation(): "Validate ARC V1 record creation" header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") @@ -92,12 +92,12 @@ def test_arc_v1_record_creation(): def test_arc_v2_record_creation(): "Validate ARC V1 record creation" header = dict(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") @@ -120,8 +120,8 @@ def test_arc_v1_writer(): f = arc.ARCFile(fileobj = opfile, version = 1, file_headers = file_headers) for payload in "Payload1 Payload2".split(): header = dict(url = "http://www.archive.org", - ip_address = "127.0.0.1", - date = now, + ip_address = "127.0.0.1", + date = now, content_type = "text/html", length = len(payload)) r = arc.ARCRecord(headers = header, payload = payload) @@ -137,12 +137,12 @@ def test_arc1_v1_writer_default_headers(): opfile = io.StringIO() opfile.name = "sample.arc" # Necessary since only file objects in Python have names. - + f = arc.ARCFile(fileobj = opfile, version = 1, file_headers = file_headers) for payload in "Payload1 Payload2".split(): header = dict(url = "http://www.archive.org", - ip_address = "127.0.0.1", - date = now, + ip_address = "127.0.0.1", + date = now, content_type = "text/html", length = len(payload)) r = arc.ARCRecord(headers = header, payload = payload) @@ -164,12 +164,12 @@ def test_arc_v2_writer(): f = arc.ARCFile(fileobj = opfile, file_headers = file_headers) for payload in "Payload1 Payload2".split(): header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") @@ -182,16 +182,16 @@ def test_arc_reader_guess_version(): "Make sure that the ARCFile object automatically detects the file version" v1 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload1\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload2") v2 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") - + arc_v1 = arc.ARCFile(fileobj = v1) arc_v2 = arc.ARCFile(fileobj = v2) arc_v1.read() arc_v2.read() - + assert arc_v1.version == 1 assert arc_v2.version == 2 - + def test_arc_reader_read_file_headers(): "Make sure that the parser is reading file headers properly" ip = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") @@ -202,14 +202,14 @@ def test_arc_reader_read_file_headers(): arc_file.file_headers['org'] == "Internet Archive" -def test_arc_reader_v1(): +def test_arc_reader_v1(): "Make sure that the parser reads out V1 ARC records. (Also tests iterator behaviour)" v1 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\nPayload1\nhttp://archive.org 127.0.0.1 20120302193211 text/plain 8\nPayload2") - arc_file = arc.ARCFile(fileobj = v1) + arc_file = arc.ARCFile(fileobj = v1) r1 = arc_file.read() r2 = arc_file.read() - + assert r1['url'] == "http://www.archive.org" assert r1['ip_address'] == "127.0.0.1" assert r1['date'] == "20120302193210" @@ -225,12 +225,12 @@ def test_arc_reader_v1(): assert r2.payload == "Payload2" -def test_arc_reader_v2(): +def test_arc_reader_v2(): "Make sure that the parser reads out V2 ARC records. (Also tests iterator behaviour)" v2 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload2") - arc_file = arc.ARCFile(fileobj = v2) + arc_file = arc.ARCFile(fileobj = v2) r1, r2 = list(arc_file) - + assert r1['url'] == "http://archive.org" assert r1['ip_address'] == "127.0.0.1" assert r1['date'] == "20120301093000" @@ -274,12 +274,12 @@ def test_arc_v2_record_from_string(): def test_arc_record_versions(): "Check initialising an ARCRecord with a version to see if it outputs stuff properly" header = dict(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") @@ -304,7 +304,7 @@ def test_write_headers(self): f.name = "sample.arc" afile = arc.ARCFile(fileobj=f, version=1) afile._write_header() - + # Make sure header is written only once assert f.getvalue().count("filedesc://") == 1 @@ -322,4 +322,4 @@ def test_no_filename(self): afile._write_header() # filename should be empty assert f.getvalue().startswith("filedesc:// ") - + diff --git a/warc/tests/test_common.py b/warc/tests/test_common.py index 7981047..4ad14ec 100644 --- a/warc/tests/test_common.py +++ b/warc/tests/test_common.py @@ -5,7 +5,7 @@ def test_open_warc_file(): "Test opening a WARC file" - + f = libopen("foo.warc","wb") assert isinstance(f, WARCFile) f.close() diff --git a/warc/tests/test_utils.py b/warc/tests/test_utils.py index bd7862a..6be5abc 100644 --- a/warc/tests/test_utils.py +++ b/warc/tests/test_utils.py @@ -1,5 +1,5 @@ from ..utils import FilePart, CaseInsensitiveDict -import io +import io class TestCaseInsensitiveDict: def test_all(self): @@ -13,29 +13,29 @@ def test_all(self): d['BAR'] = 2 assert 'bar' in d assert d['bar'] == 2 - + assert sorted(d.keys()) == ["bar", "foo"] assert sorted(d.items()) == [("bar", 2), ("foo", 1)] - + class TestFilePart: def setup(self): # 5 chars in each line self.text = b"\n".join([b"aaaa", b"bbbb", b"cccc", b"dddd", b"eeee", b"ffff"]) - + def test_read(self): part = FilePart(io.BytesIO(self.text), 0) assert part.read() == b"" - + part = FilePart(io.BytesIO(self.text), 5) assert part.read() == b"aaaa\n" part = FilePart(io.BytesIO(self.text), 10) assert part.read() == b"aaaa\nbbbb\n" - + # try with large data part = FilePart(io.BytesIO(b"a" * 10000), 10) assert len(part.read()) == 10 - + def test_read_with_size(self): part = FilePart(io.BytesIO(self.text), 10) assert part.read(3) == b"aaa" @@ -43,7 +43,7 @@ def test_read_with_size(self): assert part.read(3) == b"bbb" assert part.read(3) == b"\n" assert part.read(3) == b"" - + def test_read_with_buffer(self): "Tests read size when read length is larger than buffer." fb = io.BytesIO(b'a' * 10000) @@ -51,14 +51,14 @@ def test_read_with_buffer(self): temp = part.read(100) part._unread(temp) assert len(part.read(1000)) == 1000 - + def test_readline(self): part = FilePart(io.BytesIO(self.text), 11) assert part.readline() == b"aaaa\n" assert part.readline() == b"bbbb\n" assert part.readline() == b"c" assert part.readline() == b"" - + def test_iter(self): part = FilePart(io.BytesIO(self.text), 11) assert list(part) == [b"aaaa\n", b"bbbb\n", b"c"] diff --git a/warc/utils.py b/warc/utils.py index 9dac1e5..ecd9f97 100644 --- a/warc/utils.py +++ b/warc/utils.py @@ -7,13 +7,17 @@ :copyright: (c) 2012 Internet Archive """ -from collections import MutableMapping +from collections import MutableMapping, Mapping from http.client import HTTPMessage import email.parser +import sys +import re + +SEP = re.compile("[;:=]") class CaseInsensitiveDict(MutableMapping): """Almost like a dictionary, but keys are case-insensitive. - + >>> d = CaseInsensitiveDict(foo=1, Bar=2) >>> d['foo'] 1 @@ -28,36 +32,36 @@ class CaseInsensitiveDict(MutableMapping): def __init__(self, *args, **kwargs): self._d = {} self.update(dict(*args, **kwargs)) - + def __setitem__(self, name, value): self._d[name.lower()] = value - + def __getitem__(self, name): return self._d[name.lower()] - + def __delitem__(self, name): del self._d[name.lower()] - + def __eq__(self, other): return isinstance(other, CaseInsensitiveDict) and other._d == self._d - + def __iter__(self): return iter(self._d) - + def __len__(self): return len(self._d) class FilePart: """File interface over a part of file. - - Takes a file and length to read from the file and returns a file-object + + Takes a file and length to read from the file and returns a file-object over that part of the file. """ def __init__(self, fileobj, length): self.fileobj = fileobj self.length = length self.offset = 0 - self.buf = b'' + self.buf = b'' def read(self, size=-1): if size == -1: @@ -96,7 +100,7 @@ def __iter__(self): while line: yield line line = self.readline() - + class HTTPObject(CaseInsensitiveDict): """Small object to help with parsing HTTP warc entries""" def __init__(self, request_file): @@ -107,7 +111,7 @@ def __init__(self, request_file): #This is not an HTTP object. request_file._unread(id_str_raw) raise ValueError("Object is not HTTP.") - + words = id_str.split() command = path = status = error = version = None #If length is not 3 it is a bad version line. @@ -118,7 +122,7 @@ def __init__(self, request_file): status = " ".join(words[2:]) else: command, path, version = words - + self._id = { "vline": id_str_raw, "command": command, @@ -127,11 +131,12 @@ def __init__(self, request_file): "error": error, "version": version, } - + self._header, self.hstring = self._parse_headers(request_file) super().__init__(self._header) self.payload = request_file - + self._content = None + @staticmethod def _parse_headers(fp): """This is a modification of the python3 http.clint.parse_headers function.""" @@ -143,21 +148,19 @@ def _parse_headers(fp): break hstring = b''.join(headers) return email.parser.Parser(_class=HTTPMessage).parsestr(hstring.decode('iso-8859-1')), hstring - + def __repr__(self): return(self.vline + str(self._header)) - + def __getitem__(self, name): try: return super().__getitem__(name) except KeyError: value = name.lower() if value == "content_type": - return self.content_type - elif value == "charset": - return self.charset - elif value == "host": - return self.host + return self.content.type + elif value in self.content: + return self.content[value] elif value in self._id: return self._id[value] else: @@ -166,14 +169,24 @@ def __getitem__(self, name): def _reset(self): self.payload._unread(self.hstring) self.payload._unread(self._id['vline']) - + def write_to(self, f): f.write(self._id['vline']) f.write(self.hstring) f.write(self.payload.read()) f.write(b"\r\n\r\n") f.flush() - + + @property + def content(self): + if self._content is None: + try: + string = self._d["content-type"] + except KeyError: + string = '' + self._content = ContentType(string) + return self._content + @property def vline(self): return self._id["vline"].decode("iso-8859-1") @@ -182,40 +195,6 @@ def vline(self): def version(self): return self._id["version"] - #Request - @property - def command(self): - return self._id["command"] - - @property - def path(self): - return self._id["path"] - - @property - def host(self): - try: - return self._d['host'] - except: - return None - - #Response - @property - def status(self): - return self._id["status"] - - @property - def error(self): - return self._id["error"] - - #Inherited from email parser. - @property - def content_type(self): - return self._header.get_content_type() - - @property - def charset(self): - return self._header.get_content_charset() - def write_payload_to(self, fp): encoding = self._header.get("Transfer-Encoding", "None") if encoding == "chunked": @@ -228,13 +207,24 @@ def write_payload_to(self, fp): else: length = int(self._header.get("Content-Length", -1)) found = self.payload.read(length) - + fp.write(found) - - - - - - - - + +class ContentType(CaseInsensitiveDict): + def __init__(self, string): + data = {} + self.type = '' + if string: + _list = [i.strip() for i in string.lower().split(";")] + self.type = _list[0] + + data["type"] = _list[0] + for i in _list[1:]: + test = [n.strip() for n in re.split(SEP, i)] + data[test[0]] = test[1] + + super().__init__(data) + + def __repr__(self): + return self.type + diff --git a/warc/warc.py b/warc/warc.py index dbe5e18..2321051 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -17,22 +17,22 @@ import hashlib import sys -from .utils import CaseInsensitiveDict, FilePart, HTTPObject +from .utils import CaseInsensitiveDict, FilePart, HTTPObject, ContentType class WARCHeader(CaseInsensitiveDict): """The WARC Header object represents the headers of a WARC record. - It provides dictionary like interface for accessing the headers. - + It provides dictionary like interface for accessing the headers. + The following mandatory fields are accessible also as attributes. - + * h.record_id == h['WARC-Record-ID'] * h.content_length == int(h['Content-Length']) * h.date == h['WARC-Date'] * h.type == h['WARC-Type'] - - :params headers: dictionary of headers. - :params defaults: If True, important headers like WARC-Record-ID, + + :params headers: dictionary of headers. + :params defaults: If True, important headers like WARC-Record-ID, WARC-Date, Content-Type and Content-Length are initialized to automatically if not already present. TODO: @@ -41,9 +41,9 @@ class WARCHeader(CaseInsensitiveDict): * url * ip_address * date (date of archival) - * content_type + * content_type * result_code (response code) - * checksum + * checksum * location * offset (offset from beginning of file to recrod) * filename (name of arc file) @@ -54,7 +54,7 @@ class WARCHeader(CaseInsensitiveDict): response='application/http; msgtype=response', request='application/http; msgtype=request', metadata='application/warc-fields') - + KNOWN_HEADERS = { "type": "WARC-Type", "date": "WARC-Date", @@ -66,16 +66,16 @@ class WARCHeader(CaseInsensitiveDict): "content_type": "Content-Type", "content_length": "Content-Length" } - + def __init__(self, headers, defaults=False): self.version = "WARC/1.0" super().__init__(headers) if defaults: self.init_defaults() - + def init_defaults(self): """Initializes important headers to default values, if not already specified. - + The WARC-Record-ID header is set to a newly generated UUID. The WARC-Date header is set to the current datetime. The Content-Type is set based on the WARC-Type header. @@ -87,7 +87,7 @@ def init_defaults(self): self['WARC-Date'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') if "Content-Type" not in self: self['Content-Type'] = WARCHeader.CONTENT_TYPES.get(self.type, "application/octet-stream") - + def write_to(self, f): """Writes this header to a file, in the format specified by WARC. """ @@ -100,7 +100,7 @@ def write_to(self, f): f.write(b": ") f.write(str(value).encode()) f.write(b"\r\n") - + # Header ends with an extra CRLF f.write(b"\r\n") @@ -108,27 +108,27 @@ def write_to(self, f): def content_length(self): """The Content-Length header as int.""" return int(self['Content-Length']) - + @property - def type(self): + def type(self): """The value of WARC-Type header.""" return self['WARC-Type'] - + @property def record_id(self): """The value of WARC-Record-ID header.""" return self['WARC-Record-ID'] - + @property def date(self): """The value of WARC-Date header.""" return self['WARC-Date'] - + def __str__(self): f = io.BytesIO() self.write_to(f) return str(f.getvalue(), 'utf-8') - + def __repr__(self): return "" % (self.type, self.record_id) @@ -136,34 +136,35 @@ class WARCRecord(object): """The WARCRecord object represents a WARC Record. """ def __init__(self, header=None, payload=None, headers={}, defaults=True): - """Creates a new WARC record. + """Creates a new WARC record. """ if header is None and defaults is True: headers.setdefault("WARC-Type", "response") self.header = header or WARCHeader(headers, defaults=True) - + if defaults is True and 'Content-Length' not in self.header: if payload: self.header['Content-Length'] = len(payload) else: self.header['Content-Length'] = "0" - + if defaults is True and 'WARC-Payload-Digest' not in self.header: self.header['WARC-Payload-Digest'] = self._compute_digest(payload) - + if isinstance(payload, str): payload = payload.encode() if isinstance(payload, bytes): payload = io.BytesIO(payload) - + self.payload = payload self._http = None - + self._content = None + def _compute_digest(self, payload): return "sha1:" + hashlib.sha1(payload).hexdigest() - + def write_to(self, f): self.header.write_to(f) if self.http: @@ -172,17 +173,26 @@ def write_to(self, f): f.write(b"\r\n") f.write(b"\r\n") f.flush() - + + @property + def content(self): + if self._content is None: + try: + string = self.header["content-type"] + except KeyError: + string = '' + self._content = ContentType(string) + return self._content + @property def http(self): if self._http is None: -# print(self.header['content-type']) if 'application/http' in self.header['content-type']: self._http = HTTPObject(self.payload) else: self._http = False return self._http - + @property def type(self): """Record type""" @@ -192,11 +202,11 @@ def type(self): def url(self): """The value of the WARC-Target-URI header if the record is of type "response".""" return self.header.get('WARC-Target-URI') - + @property def ip_address(self): - """The IP address of the host contacted to retrieve the content of this record. - + """The IP address of the host contacted to retrieve the content of this record. + This value is available from the WARC-IP-Address header.""" return self.header.get('WARC-IP-Address') @@ -204,46 +214,53 @@ def ip_address(self): def date(self): """UTC timestamp of the record.""" return self.header.get("WARC-Date") - + @property def checksum(self): return self.header.get('WARC-Payload-Digest') - + @property def offset(self): """Offset of this record in the warc file from which this record is read. """ pass - + def __getitem__(self, name): - return self.header[name] + try: + return self.header[name] + except KeyError: + if name == "content_type": + return self.content.type + elif name in self.content: + return self.content[name] + def __setitem__(self, name, value): self.header[name] = value - + def __contains__(self, name): return name in self.header - + def __str__(self): f = io.BytesIO() self.write_to(f) return str(f.getvalue()) - + def __repr__(self): return "" % (self.type, self['WARC-Record-ID']) - + @staticmethod def from_response(response): """Creates a WARCRecord from given response object. - This must be called before reading the response. The response can be + This must be called before reading the response. The response can be read after this method is called. - + :param response: An instance of :class:`requests.models.Response`. """ # Get the httplib.HTTPResponse object http_response = response.raw._original_response - + # HTTP status line, headers and body as strings status_line = "HTTP/1.1 %d %s" % (http_response.status, http_response.reason) headers = str(http_response.msg) @@ -254,7 +271,7 @@ def from_response(response): # Build the payload to create warc file. payload = status_line + "\r\n" + headers + "\r\n" + body - + headers = { "WARC-Type": "response", "WARC-Target-URI": response.request.url.encode('utf-8') @@ -269,56 +286,56 @@ def __init__(self, filename=None, mode=None, fileobj=None, compress=None): # initiaize compress based on filename, if not already specified if compress is None and filename and filename.endswith(".gz"): compress = True - + if compress: fileobj = gzip.open(fileobj, mode) - + self.fileobj = fileobj self._reader = None - + def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close() - + def __iter__(self): return iter(self.reader) - + @property def reader(self): if self._reader is None: self._reader = WARCReader(self.fileobj) return self._reader - + def write_record(self, warc_record): """Adds a warc record to this WARC file. """ warc_record.write_to(self.fileobj) - + def read_record(self): """Reads a warc record from this WARC file.""" return self.reader.read_record() - - + + def close(self): self.fileobj.close() - + def browse(self): """Utility to browse through the records in the warc file. - - This returns an iterator over (record, offset, size) for each record in - the file. If the file is gzip compressed, the offset and size will - corresponds to the compressed file. - - The payload of each record is limited to 1MB to keep memory consumption + + This returns an iterator over (record, offset, size) for each record in + the file. If the file is gzip compressed, the offset and size will + corresponds to the compressed file. + + The payload of each record is limited to 1MB to keep memory consumption under control. """ offset = 0 for record in self.reader: # Just read the first 1MB of the payload. - # This will make sure memory consuption is under control and it - # is possible to look at the first MB of the payload, which is + # This will make sure memory consuption is under control and it + # is possible to look at the first MB of the payload, which is # typically sufficient to read http headers in the payload. record.payload = io.BytesIO(record.payload.read(1024*1024)) self.reader.finish_reading_current_record() @@ -329,29 +346,29 @@ def browse(self): def tell(self): """Returns the file offset. """ - return self.fileobj.tell() - + return self.fileobj.tell() + class WARCReader: RE_VERSION = re.compile("WARC/(\d+.\d+)\r\n") RE_HEADER = re.compile(r"([a-zA-Z_\-]+): *(.*)\r\n") SUPPORTED_VERSIONS = ["1.0"] - + def __init__(self, fileobj): self.fileobj = fileobj self.current_payload = None - + def read_header(self, fileobj): version_line = fileobj.readline().decode("utf-8") if not version_line: return None - + m = self.RE_VERSION.match(version_line) if not m: raise IOError("Bad version line: %r" % version_line) version = m.group(1) if version not in self.SUPPORTED_VERSIONS: raise IOError("Unsupported WARC version: %s" % version) - + headers = {} while True: line = fileobj.readline().decode("utf-8") @@ -363,13 +380,13 @@ def read_header(self, fileobj): name, value = m.groups() headers[name] = value return WARCHeader(headers) - + def expect(self, fileobj, expected_line, message=None): line = fileobj.readline().decode("utf-8") if line != expected_line: message = message or "Expected %r, found %r" % (expected_line, line) raise IOError(message) - + def finish_reading_current_record(self): # consume the footer from the previous record if self.current_payload: @@ -382,11 +399,11 @@ def finish_reading_current_record(self): def read_record(self): self.finish_reading_current_record() fileobj = self.fileobj - + header = self.read_header(fileobj) if header is None: return None - + self.current_payload = FilePart(fileobj, header.content_length) record = WARCRecord(header, self.current_payload, defaults=False) return record diff --git a/warcscrape.py b/warcscrape.py new file mode 100644 index 0000000..f3776de --- /dev/null +++ b/warcscrape.py @@ -0,0 +1,205 @@ +#! /usr/bin/env python3 +import os +import re +import argparse +import warc +import sys +import mimetypes +from urllib.parse import urlparse, unquote +from pprint import pprint +import shutil + +counts = {} + +class filterObject: + """Basic object for storing filters.""" + def __init__(self, string): + self.result = True + if string[0] == "!": + self.result = False + string = string[1:] + + _list = string.lower().split(":") + + self.http = (_list[0] == 'http') + if self.http: + del _list[0] + + self.k = _list[0] + self.v = _list[1] + +def inc(obj, header=False, dic=False): + """Short script for counting entries.""" + if header: + try: + obj = obj[header] + except KeyError: + obj = None + + holder = counts + if dic: + if dic not in counts: + counts[dic] = {} + holder = counts[dic] + + if obj in holder: + holder[obj] += 1 + else: + holder[obj] = 1 + +def warc_records(string, path): + """Iterates over warc records in path.""" + for filename in os.listdir(path): + if re.search(string, filename) and ".warc" in filename: + print("parsing", filename) + with warc.open(path + filename) as warc_file: + for record in warc_file: + yield record + +def checkFilter(filters, record): + """Check record against filters.""" + for i in filters: + if i.http: + if not record.http: + return False + value = record.http + else: + value = record.header + + string = value.get(i.k, None) + if not string or (i.v in string) != i.result: + return False + return True + +def parse(args): + #Clear output warc file. + if args.dump == "warc": + print("Recording", args.dump, "to", args.output + ".") + with open(args.output_path + args.output, "wb"): + pass + + for record in warc_records(args.string, args.path): + try: + #Filter out unwanted entries. + if not checkFilter(args.filter, record): + continue + + #Increment Index counters. + if args.silence: + inc("records") + inc(record,"warc-type", "types") + inc(record, "content_type", "warc-content") + if record.http: + inc(record.http, "content_type", "http-content") + inc(record.http, "error", "status") + + #Dump records to file. + if args.dump == "warc": + with open(args.output_path + args.output, "ab") as output: + record.write_to(output) + + if args.dump == "content": + url = urlparse(unquote(record['WARC-Target-URI'])) + + #Set up folder + index = url.path.rfind("/") + 1 + file = url.path[index:] + path = url.path[:index] + + #Process filename + if "." not in file: + path += file + if not path.endswith("/"): + path += "/" + + file = 'index.html' + + #Final fixes. + path = path.replace(".", "-") + host = url.hostname.replace('www.', '', 1) + path = args.output_path + host + path + + #Create new directories + if not os.path.exists(path): + try: + os.makedirs(path) + except OSError: + path = "/".join([i[:25] for i in path.split("/")]) + os.makedirs(path) + + #Test if file has a proper extension. + index = file.index(".") + suffix = file[index:] + content = record.http.get("content_type", "") + slist = mimetypes.guess_all_extensions(content) + if suffix not in slist: + #Correct suffix if we can. + suffix = mimetypes.guess_extension(content) + if suffix: + file = file[:index] + suffix + else: + inc(record.http, "content_type", "unknown mime type") + + #Check for gzip compression. + if record.http.get("content-encoding", None) == "gzip": + file += ".gz" + + path += file + + #If Duplicate file then insert numbers + index = path.rfind(".") + temp = path + n = 0 + while os.path.isfile(temp): + n +=1 + temp = path[:index] + "("+ str(n) + ")" + path[index:] + path = temp + + #Write file. + with open(path, 'wb') as fp: + record.http.write_payload_to(fp) + except: + if args.error: + print("Error in record. Recording to error.warc.") + with open(args.output_path + "error.warc", "wb") as fp: + record.write_to(fp) + else: + raise + + #print results + if args.silence: + print("-----------------------------") + for i in counts: + print("\nCount of {}.".format(i)) + pprint(counts[i]) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Extracts attributes from warc files.') + parser.add_argument("filter", nargs='*', help="Attributes to filter by. Entries that do not contain filtered elements are ignored. Example: warc-type:response, would ignore all warc entries that are not responses. Attributes in an HTTP object should be prefixed by 'http'. Example, http:error:200.") + parser.add_argument("-silence", action="store_false", help="Silences output of warc data.") + parser.add_argument("-error", action="store_true", help="Silences most errors and records problematic warc entries to error.warc.") + parser.add_argument("-string", default="", help="Regular expression to limit parsed warc files. Defaults to empty string.") + parser.add_argument("-path", default="./", help="Path to folder containing warc files. Defaults to current folder.") + parser.add_argument("-output_path", default="data/", help="Path to folder to dump content files. Defaults to data/ folder.") + parser.add_argument("-output", default="output.warc", help="File to output warc contents. Defaults to 'output.warc'.") + parser.add_argument("-dump", choices=['warc', 'content'], type=str, help="Dumps all entries that survived filter. 'warc' creates a filtered warc file. 'content' tries to reproduce file structure of archived websites.") + args = parser.parse_args() + + if args.path[-1] != "/": + args.path += "/" + + if args.output_path[-1] != "/": + args.output_path += "/" + + if not os.path.exists(args.output_path): + os.makedirs(args.output_path) + + #Forced filters + if args.dump == "content": + args.filter.append("warc-type:response") + args.filter.append("content-type:application/http") + + args.filter = [filterObject(i) for i in args.filter] + + args.string = re.compile(args.string) + parse(args) From 00be647a1e31b0ff87311cde2c0186be5b6fc779 Mon Sep 17 00:00:00 2001 From: erroneousboat Date: Mon, 27 Jul 2015 11:56:41 +0200 Subject: [PATCH 12/28] Fix TypeError gzip.open() gzip.open() expects a string as filename. In the prior implementation it passed a fileobject as argument. This resulted in a type error. Changing it to pass the name of the file as argument to gzip.open() fixes this problem. --- warc/warc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warc/warc.py b/warc/warc.py index 2321051..57fd049 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -288,7 +288,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, compress=None): compress = True if compress: - fileobj = gzip.open(fileobj, mode) + fileobj = gzip.open(fileobj.name, mode) self.fileobj = fileobj self._reader = None From 9679eadc770fe5b6f1d525da59241b78b9265344 Mon Sep 17 00:00:00 2001 From: erroneousboat Date: Mon, 27 Jul 2015 16:29:52 +0200 Subject: [PATCH 13/28] Fix TypeError --- warc/warc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warc/warc.py b/warc/warc.py index 57fd049..fb68eb6 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -270,7 +270,7 @@ def from_response(response): response.raw._fp = io.BytesIO(body) # Build the payload to create warc file. - payload = status_line + "\r\n" + headers + "\r\n" + body + payload = status_line + "\r\n" + headers + "\r\n" + str(body) headers = { "WARC-Type": "response", From dea56d69c5c302c290aede35b0782d65ad700d78 Mon Sep 17 00:00:00 2001 From: erroneousboat Date: Mon, 27 Jul 2015 16:38:28 +0200 Subject: [PATCH 14/28] Fix TypeError: Unicode-objects must be encoded before hashing --- warc/warc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warc/warc.py b/warc/warc.py index fb68eb6..b48b615 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -163,7 +163,7 @@ def __init__(self, header=None, payload=None, headers={}, defaults=True): self._content = None def _compute_digest(self, payload): - return "sha1:" + hashlib.sha1(payload).hexdigest() + return "sha1:" + hashlib.sha1(payload.encode()).hexdigest() def write_to(self, f): self.header.write_to(f) From 31a12173a05ec92780ccda16bcf10256b4615d48 Mon Sep 17 00:00:00 2001 From: erroneousboat Date: Mon, 27 Jul 2015 16:53:47 +0200 Subject: [PATCH 15/28] Factor out HTTPObject --- warc/utils.py | 128 -------------------------------------------------- warc/warc.py | 12 ----- 2 files changed, 140 deletions(-) diff --git a/warc/utils.py b/warc/utils.py index ecd9f97..4298fa2 100644 --- a/warc/utils.py +++ b/warc/utils.py @@ -100,131 +100,3 @@ def __iter__(self): while line: yield line line = self.readline() - -class HTTPObject(CaseInsensitiveDict): - """Small object to help with parsing HTTP warc entries""" - def __init__(self, request_file): - #Parse version line - id_str_raw = request_file.readline() - id_str = id_str_raw.decode("iso-8859-1") - if "HTTP" not in id_str: - #This is not an HTTP object. - request_file._unread(id_str_raw) - raise ValueError("Object is not HTTP.") - - words = id_str.split() - command = path = status = error = version = None - #If length is not 3 it is a bad version line. - if len(words) >= 3: - if words[1].isdigit(): - version = words[0] - error = words[1] - status = " ".join(words[2:]) - else: - command, path, version = words - - self._id = { - "vline": id_str_raw, - "command": command, - "path": path, - "status": status, - "error": error, - "version": version, - } - - self._header, self.hstring = self._parse_headers(request_file) - super().__init__(self._header) - self.payload = request_file - self._content = None - - @staticmethod - def _parse_headers(fp): - """This is a modification of the python3 http.clint.parse_headers function.""" - headers = [] - while True: - line = fp.readline(65536) - headers.append(line) - if line in (b'\r\n', b'\n', b''): - break - hstring = b''.join(headers) - return email.parser.Parser(_class=HTTPMessage).parsestr(hstring.decode('iso-8859-1')), hstring - - def __repr__(self): - return(self.vline + str(self._header)) - - def __getitem__(self, name): - try: - return super().__getitem__(name) - except KeyError: - value = name.lower() - if value == "content_type": - return self.content.type - elif value in self.content: - return self.content[value] - elif value in self._id: - return self._id[value] - else: - raise - - def _reset(self): - self.payload._unread(self.hstring) - self.payload._unread(self._id['vline']) - - def write_to(self, f): - f.write(self._id['vline']) - f.write(self.hstring) - f.write(self.payload.read()) - f.write(b"\r\n\r\n") - f.flush() - - @property - def content(self): - if self._content is None: - try: - string = self._d["content-type"] - except KeyError: - string = '' - self._content = ContentType(string) - return self._content - - @property - def vline(self): - return self._id["vline"].decode("iso-8859-1") - - @property - def version(self): - return self._id["version"] - - def write_payload_to(self, fp): - encoding = self._header.get("Transfer-Encoding", "None") - if encoding == "chunked": - found = b'' - length = int(str(self.payload.readline(), "iso-8859-1").rstrip(), 16) - while length > 0: - found += self.payload.read(length) - self.payload.readline() - length = int(str(self.payload.readline(), "iso-8859-1").rstrip(), 16) - else: - length = int(self._header.get("Content-Length", -1)) - found = self.payload.read(length) - - fp.write(found) - -class ContentType(CaseInsensitiveDict): - def __init__(self, string): - data = {} - self.type = '' - if string: - _list = [i.strip() for i in string.lower().split(";")] - self.type = _list[0] - - data["type"] = _list[0] - for i in _list[1:]: - test = [n.strip() for n in re.split(SEP, i)] - data[test[0]] = test[1] - - super().__init__(data) - - def __repr__(self): - return self.type - diff --git a/warc/warc.py b/warc/warc.py index b48b615..3501221 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -159,7 +159,6 @@ def __init__(self, header=None, payload=None, headers={}, defaults=True): payload = io.BytesIO(payload) self.payload = payload - self._http = None self._content = None def _compute_digest(self, payload): @@ -167,8 +166,6 @@ def _compute_digest(self, payload): def write_to(self, f): self.header.write_to(f) - if self.http: - self.http._reset() f.write(self.payload.read()) f.write(b"\r\n") f.write(b"\r\n") @@ -184,15 +181,6 @@ def content(self): self._content = ContentType(string) return self._content - @property - def http(self): - if self._http is None: - if 'application/http' in self.header['content-type']: - self._http = HTTPObject(self.payload) - else: - self._http = False - return self._http - @property def type(self): """Record type""" From 1fd24d823edab51c04f07cfbad03ea88d1761e44 Mon Sep 17 00:00:00 2001 From: erroneousboat Date: Mon, 27 Jul 2015 17:02:44 +0200 Subject: [PATCH 16/28] Fix _compute_digest() --- warc/warc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/warc/warc.py b/warc/warc.py index 3501221..71bcc07 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -150,19 +150,19 @@ def __init__(self, header=None, payload=None, headers={}, defaults=True): else: self.header['Content-Length'] = "0" - if defaults is True and 'WARC-Payload-Digest' not in self.header: - self.header['WARC-Payload-Digest'] = self._compute_digest(payload) - if isinstance(payload, str): payload = payload.encode() if isinstance(payload, bytes): payload = io.BytesIO(payload) + if defaults is True and 'WARC-Payload-Digest' not in self.header: + self.header['WARC-Payload-Digest'] = self._compute_digest(payload) + self.payload = payload self._content = None def _compute_digest(self, payload): - return "sha1:" + hashlib.sha1(payload.encode()).hexdigest() + return "sha1:" + hashlib.sha1(payload).hexdigest() def write_to(self, f): self.header.write_to(f) From 34c990f9fa18387b289f6c244f66f76a2647b0c2 Mon Sep 17 00:00:00 2001 From: erroneousboat Date: Mon, 27 Jul 2015 17:10:54 +0200 Subject: [PATCH 17/28] Remove reference to HTTPObject --- .gitignore | 1 + warc/__init__.py | 1 - warc/warc.py | 22 ++++++---------------- 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index 0a312c6..c647a60 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ docs/_build/ build/ .coverage htmlcov/ +.ropeproject/ diff --git a/warc/__init__.py b/warc/__init__.py index 68634d1..ac45bc4 100644 --- a/warc/__init__.py +++ b/warc/__init__.py @@ -9,7 +9,6 @@ from .arc import ARCFile, ARCRecord, ARCHeader from .warc import WARCFile, WARCRecord, WARCHeader, WARCReader -from .utils import HTTPObject def detect_format(filename): """Tries to figure out the type of the file. Return 'warc' for diff --git a/warc/warc.py b/warc/warc.py index 71bcc07..4012404 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -17,7 +17,7 @@ import hashlib import sys -from .utils import CaseInsensitiveDict, FilePart, HTTPObject, ContentType +from .utils import CaseInsensitiveDict, FilePart class WARCHeader(CaseInsensitiveDict): """The WARC Header object represents the headers of a WARC record. @@ -150,19 +150,19 @@ def __init__(self, header=None, payload=None, headers={}, defaults=True): else: self.header['Content-Length'] = "0" + if defaults is True and 'WARC-Payload-Digest' not in self.header: + self.header['WARC-Payload-Digest'] = self._compute_digest(payload) + if isinstance(payload, str): payload = payload.encode() if isinstance(payload, bytes): payload = io.BytesIO(payload) - if defaults is True and 'WARC-Payload-Digest' not in self.header: - self.header['WARC-Payload-Digest'] = self._compute_digest(payload) - self.payload = payload self._content = None def _compute_digest(self, payload): - return "sha1:" + hashlib.sha1(payload).hexdigest() + return "sha1:" + hashlib.sha1(payload.encode()).hexdigest() def write_to(self, f): self.header.write_to(f) @@ -171,16 +171,6 @@ def write_to(self, f): f.write(b"\r\n") f.flush() - @property - def content(self): - if self._content is None: - try: - string = self.header["content-type"] - except KeyError: - string = '' - self._content = ContentType(string) - return self._content - @property def type(self): """Record type""" @@ -258,7 +248,7 @@ def from_response(response): response.raw._fp = io.BytesIO(body) # Build the payload to create warc file. - payload = status_line + "\r\n" + headers + "\r\n" + str(body) + payload = status_line + b'\r\n' + headers + b'\r\n' + body headers = { "WARC-Type": "response", From 985be9cc98e2a92e108b82dbcf06185cafcf66ab Mon Sep 17 00:00:00 2001 From: "Almer S. Tigelaar" Date: Tue, 28 Jul 2015 14:06:54 +0200 Subject: [PATCH 18/28] * warc.py: fix for creating warc files based on a requests response. The old approach assumed to much about the internals of requests and ended up with an empty body, it also mixed byte encoded content with regular python strings (the latter was the intention so I adapted the code to that). --- warc/warc.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/warc/warc.py b/warc/warc.py index 4012404..297d47e 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -238,21 +238,38 @@ def from_response(response): """ # Get the httplib.HTTPResponse object http_response = response.raw._original_response - - # HTTP status line, headers and body as strings + + # HTTP status line, headers as string status_line = "HTTP/1.1 %d %s" % (http_response.status, http_response.reason) headers = str(http_response.msg) - body = http_response.read() - - # Monkey-patch the response object so that it is possible to read from it later. - response.raw._fp = io.BytesIO(body) - # Build the payload to create warc file. - payload = status_line + b'\r\n' + headers + b'\r\n' + body + # Detect character set in headers + charset_match = re.search('charset=(.*)', headers, re.IGNORECASE) + if charset_match: + charset = charset_match.group(1) + else: + charset = 'utf8' + + # Read raw response data out of request + stream = io.BytesIO() + for chunk in response.iter_content(1024): + stream.write(chunk) + + # We need to decode the content properly, we try first to + # respect the given encoding, failing that we fallback to + # utf-8, failing that we simply give up. + body_raw = stream.getvalue() + try: + body = body_raw.decode(charset) + except: + body = body_raw.decode('utf8') + # Concat into one response + payload = status_line + '\r\n' + headers + '\r\n' + body + headers = { "WARC-Type": "response", - "WARC-Target-URI": response.request.url.encode('utf-8') + "WARC-Target-URI": response.request.url } return WARCRecord(payload=payload, headers=headers) From dff7aca3eac6f25cffa1bbc954610d084acc9e1c Mon Sep 17 00:00:00 2001 From: "Almer S. Tigelaar" Date: Tue, 11 Aug 2015 16:40:49 +0200 Subject: [PATCH 19/28] * warc.py: disable encoding/decoding and simply store and work with raw bytes instead. This simplifies the code and fixes an alignment bug. It also moves the decoding responsibility the the user to the user of the library. --- warc/warc.py | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/warc/warc.py b/warc/warc.py index 297d47e..59b3fcd 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -137,6 +137,8 @@ class WARCRecord(object): """ def __init__(self, header=None, payload=None, headers={}, defaults=True): """Creates a new WARC record. + + @param payload must be of type 'bytes' or FilePart """ if header is None and defaults is True: @@ -149,20 +151,18 @@ def __init__(self, header=None, payload=None, headers={}, defaults=True): self.header['Content-Length'] = len(payload) else: self.header['Content-Length'] = "0" - + if defaults is True and 'WARC-Payload-Digest' not in self.header: self.header['WARC-Payload-Digest'] = self._compute_digest(payload) - if isinstance(payload, str): - payload = payload.encode() if isinstance(payload, bytes): payload = io.BytesIO(payload) - + self.payload = payload self._content = None def _compute_digest(self, payload): - return "sha1:" + hashlib.sha1(payload.encode()).hexdigest() + return "sha1:" + hashlib.sha1(payload).hexdigest() def write_to(self, f): self.header.write_to(f) @@ -243,29 +243,16 @@ def from_response(response): status_line = "HTTP/1.1 %d %s" % (http_response.status, http_response.reason) headers = str(http_response.msg) - # Detect character set in headers - charset_match = re.search('charset=(.*)', headers, re.IGNORECASE) - if charset_match: - charset = charset_match.group(1) - else: - charset = 'utf8' - # Read raw response data out of request stream = io.BytesIO() + stream.write(status_line.encode()) + stream.write(b'\r\n') + stream.write(http_response.msg.as_bytes()) + stream.write(b'\r\n') for chunk in response.iter_content(1024): stream.write(chunk) - # We need to decode the content properly, we try first to - # respect the given encoding, failing that we fallback to - # utf-8, failing that we simply give up. - body_raw = stream.getvalue() - try: - body = body_raw.decode(charset) - except: - body = body_raw.decode('utf8') - - # Concat into one response - payload = status_line + '\r\n' + headers + '\r\n' + body + payload = stream.getvalue() headers = { "WARC-Type": "response", From 7a5fc7de8454d3ee5c5dfe7e9750e269d1c66c10 Mon Sep 17 00:00:00 2001 From: "Almer S. Tigelaar" Date: Thu, 13 Aug 2015 21:50:27 +0200 Subject: [PATCH 20/28] * Remove outdated build link, add documentation note, add credits. --- Readme.rst | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/Readme.rst b/Readme.rst index 51e9f6b..5ee4798 100644 --- a/Readme.rst +++ b/Readme.rst @@ -1,11 +1,7 @@ warc3: Python3 library to work with WARC files -============================================= +============================================== -(Note: This is a fork of the original (now dead) warc repository. This project is still in transition and probobly not stable.) - -.. image:: https://secure.travis-ci.org/anandology/warc.png?branch=master - :alt: build status - :target: http://travis-ci.org/anandology/warc +Note: This is a fork of the original (now dead) warc repository. WARC (Web ARChive) is a file format for storing web crawls. @@ -22,6 +18,9 @@ Documentation ------------- The documentation of the warc library is available at http://warc.readthedocs.org/. + +Apart from the install from pip, which will not work for this warc3 version, the +interface as described there is unchanged. License ------- @@ -29,3 +28,15 @@ License This software is licensed under GPL v2. See LICENSE_ file for details. .. LICENSE: http://github.com/internetarchive/warc/blob/master/LICENSE + +Authors +------- + +Original Python2 Versions: + Anand Chitipothu + Noufal Ibrahim + +Python3 Port: + Ryan Chartier + Jan Pieter Bruins Slot + Almer S. Tigelaar From 70aaad539df6707464154040b2e87000f2c01cdf Mon Sep 17 00:00:00 2001 From: "Almer S. Tigelaar" Date: Thu, 13 Aug 2015 21:52:00 +0200 Subject: [PATCH 21/28] * stylify --- Readme.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Readme.rst b/Readme.rst index 5ee4798..5627d82 100644 --- a/Readme.rst +++ b/Readme.rst @@ -33,10 +33,12 @@ Authors ------- Original Python2 Versions: - Anand Chitipothu - Noufal Ibrahim + +* Anand Chitipothu +* Noufal Ibrahim Python3 Port: - Ryan Chartier - Jan Pieter Bruins Slot - Almer S. Tigelaar + +* Ryan Chartier +* Jan Pieter Bruins Slot +* Almer S. Tigelaar From f89837e07862776b77549ad1ae0ad0a1b9b14ccd Mon Sep 17 00:00:00 2001 From: Daniel Robles Date: Thu, 14 Apr 2016 11:12:44 -0500 Subject: [PATCH 22/28] Fix corner case when record_length is zero --- warc/warc.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/warc/warc.py b/warc/warc.py index 59b3fcd..0d35b5f 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -151,13 +151,13 @@ def __init__(self, header=None, payload=None, headers={}, defaults=True): self.header['Content-Length'] = len(payload) else: self.header['Content-Length'] = "0" - + if defaults is True and 'WARC-Payload-Digest' not in self.header: self.header['WARC-Payload-Digest'] = self._compute_digest(payload) if isinstance(payload, bytes): payload = io.BytesIO(payload) - + self.payload = payload self._content = None @@ -238,7 +238,7 @@ def from_response(response): """ # Get the httplib.HTTPResponse object http_response = response.raw._original_response - + # HTTP status line, headers as string status_line = "HTTP/1.1 %d %s" % (http_response.status, http_response.reason) headers = str(http_response.msg) @@ -253,7 +253,7 @@ def from_response(response): stream.write(chunk) payload = stream.getvalue() - + headers = { "WARC-Type": "response", "WARC-Target-URI": response.request.url @@ -375,7 +375,8 @@ def finish_reading_current_record(self): # consume all data from the current_payload before moving to next record self.current_payload.read() self.expect(self.current_payload.fileobj, "\r\n") - self.expect(self.current_payload.fileobj, "\r\n") + if self.current_payload.length: + self.expect(self.current_payload.fileobj, "\r\n") self.current_payload = None def read_record(self): @@ -392,6 +393,10 @@ def read_record(self): def _read_payload(self, fileobj, content_length): size = 0 + if content_length <= 0: + yield b'' + raise StopIteration + while size < content_length: chunk_size = min(1024, content_length-size) chunk = fileobj.read(chunk_size) From 142bc7a8a6a24a41df6791ca17eb83fb7f59e122 Mon Sep 17 00:00:00 2001 From: Daniel Robles Date: Fri, 22 Apr 2016 11:27:07 -0500 Subject: [PATCH 23/28] extract http headers and return a io.BytesIO as payload --- warc/__init__.py | 15 ++- warc/arc.py | 233 ++++++++++++++++++++++++++++------------------- warc/utils.py | 2 + 3 files changed, 152 insertions(+), 98 deletions(-) diff --git a/warc/__init__.py b/warc/__init__.py index ac45bc4..32a04b8 100644 --- a/warc/__init__.py +++ b/warc/__init__.py @@ -7,8 +7,9 @@ :copyright: (c) 2012 Internet Archive """ -from .arc import ARCFile, ARCRecord, ARCHeader -from .warc import WARCFile, WARCRecord, WARCHeader, WARCReader +from .arc import ARCFile +from .warc import WARCFile + def detect_format(filename): """Tries to figure out the type of the file. Return 'warc' for @@ -17,15 +18,19 @@ def detect_format(filename): if filename.endswith(".warc") or filename.endswith(".warc.gz"): return "warc" + if filename.endswith('.arc') or filename.endswith('.arc.gz'): + return 'arc' + return "unknown" -def open(filename, mode="rb", format = None): + +def open(filename, mode="rb", format=None): """Shorthand for WARCFile(filename, mode). Auto detects file and opens it. """ - if format == "auto" or format == None: + if format == "auto" or format is None: format = detect_format(filename) if format == "warc": @@ -33,4 +38,4 @@ def open(filename, mode="rb", format = None): elif format == "arc": return ARCFile(filename, mode) else: - raise IOError("Don't know how to open '%s' files"%format) + raise IOError("Don't know how to open '%s' files" % format) diff --git a/warc/arc.py b/warc/arc.py index c1438be..e25d523 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -14,8 +14,16 @@ from .utils import CaseInsensitiveDict -ARC1_HEADER_RE = re.compile('(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)') -ARC2_HEADER_RE = re.compile('(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)') + +ARC1_HEADER_RE = re.compile(('(?P\S*)\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)\s(?P\S*)')) + +ARC2_HEADER_RE = re.compile(('(?P\S*)\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)')) + class ARCHeader(CaseInsensitiveDict): """ @@ -42,8 +50,10 @@ class ARCHeader(CaseInsensitiveDict): * length (length of the n/w doc in bytes) """ - def __init__(self, url = "", ip_address = "", date = "", content_type = "", - result_code = "", checksum = "", location = "", offset = "", filename = "", length = "", version = 2): + def __init__(self, url="", ip_address="", date="", + content_type="", result_code="", checksum="", + location="", offset="", filename="", length=0, + version=2): if isinstance(date, datetime.datetime): date = date.strftime("%Y%m%d%H%M%S") @@ -51,23 +61,24 @@ def __init__(self, url = "", ip_address = "", date = "", content_type = "", try: datetime.datetime.strptime(date, "%Y%m%d%H%M%S") except ValueError: - raise ValueError("Couldn't parse the date '%s' in file header"%date) + raise ValueError("Couldn't parse the date '%s' in file " + "header" % date) self.version = version - - CaseInsensitiveDict.__init__(self, - url = url, - ip_address = ip_address, - date = date, - content_type = content_type, - result_code = result_code, - checksum = checksum, - location = location, - offset = offset, - filename = filename, - length = length) - - def write_to(self, f, version = None): + super().__init__({ + 'url': url, + 'ip_address': ip_address, + 'date': date, + 'content_type': content_type, + 'result_code': result_code, + 'checksum': checksum, + 'location': location, + 'offset': offset, + 'filename': filename, + 'length': int(length), + }) + + def write_to(self, f, version=None): """ Writes out the arc header to the file like object `f`. @@ -78,22 +89,15 @@ def write_to(self, f, version = None): if not version: version = self.version if version == 1: - header = "%(url)s %(ip_address)s %(date)s %(content_type)s %(length)s" + header = ("%(url)s %(ip_address)s %(date)s " + "%(content_type)s %(length)s") elif version == 2: - header = "%(url)s %(ip_address)s %(date)s %(content_type)s %(result_code)s %(checksum)s %(location)s %(offset)s %(filename)s %(length)s" - - header = header%dict(url = self['url'], - ip_address = self['ip_address'], - date = self['date'], - content_type = self['content_type'], - result_code = self['result_code'], - checksum = self['checksum'], - location = self['location'], - offset = self['offset'], - filename = self['filename'], - length = self['length']) - f.write(header) + header = ("%(url)s %(ip_address)s %(date)s %(content_type)s " + "%(result_code)s %(checksum)s %(location)s %(offset)s " + "%(filename)s %(length)s") + header = header % dict(self) + f.write(header) @property def url(self): @@ -116,7 +120,7 @@ def result_code(self): return self["result_code"] @property - def checksum (self): + def checksum(self): return self["checksum"] @property @@ -142,7 +146,7 @@ def __str__(self): def __repr__(self): f = {} - for i in "url ip_address date content_typeresult_code checksum location offset filename length".split(): + for i in "url ip_address date content_type result_code checksum location offset filename length".split(): if hasattr(self,i): f[i] = getattr(self, i) s = ['%s = "%s"'%(k, v) for k,v in f.items()] @@ -151,12 +155,34 @@ def __repr__(self): class ARCRecord(object): - def __init__(self, header = None, payload = None, headers = {}, version = None): + def __init__(self, header=None, payload=None, headers={}, version=None): if not (header or headers): - raise TypeError("Can't write create an ARC1 record without a header") - self.header = header or ARCHeader(version = version, **headers) - self.payload = payload + raise TypeError("Can't write create an ARC1 record " + "without a header") + self.header = header or ARCHeader(version=version, **headers) + self.payload = io.BytesIO(payload) self.version = version + self._read_html_headers() + + def _read_html_headers(self): + line = self.payload.readline().decode('utf-8') + if not line.startswith("HTTP/1"): + self.payload.seek(0) + return + + headers = { + 'protocol': line.strip(), + } + for line in self.payload: + line = line.decode('utf-8') + if not line.strip(): + break + name, content = line.split(':', 1) + name = name.strip() + content = content.strip() + headers[name.lower()] = content + self.header['http_headers'] = headers + self.payload = io.BytesIO(self.payload.read()) @classmethod def from_string(cls, string, version): @@ -207,7 +233,7 @@ def __str__(self): class ARCFile(object): - def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_headers = {}, compress=False): + def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_headers = {}, compress=None): """ Initialises a file like object that can be used to read or write Arc files. Works for both version 1 or version 2. @@ -253,7 +279,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ if fileobj is None: fileobj = builtins.open(filename, mode or "rb") mode = fileobj.mode - # initialize compress based on filename, if not already specified + # initialize compress based on filename, if not already specified if compress is None and filename and filename.endswith(".gz"): compress = True @@ -325,85 +351,106 @@ def write(self, arc_record): self.header_written = True self._write_header() arc_record.write_to(self.fileobj, self.version) - self.fileobj.write("\n") # Record separator + self.fileobj.write("\n") # Record separator def _read_file_header(self): """Reads out the file header for the arc file. If version was not provided, this will autopopulate it.""" - header = self.fileobj.readline() - payload1 = self.fileobj.readline() - payload2 = self.fileobj.readline() - version, reserved, organisation = payload1.split(None, 2) + header = self.fileobj.readline().decode('utf-8') + payload1 = self.fileobj.readline().decode('utf-8') + payload2 = self.fileobj.readline().decode('utf-8') + version, reserved, organisation = payload1.split(maxsplit=2) self.header_read = True - # print "--------------------------------------------------" - # print header,"\n", payload1, "\n", payload2,"\n" - # print "--------------------------------------------------" + version = int(version) + # print("--------------------------------------------------") + # print(header, "\n", payload1, "\n", payload2, "\n", version) + # print("--------------------------------------------------") if self.version and int(self.version) != version: - raise IOError("Version mismatch. Requested version was '%s' but version in file was '%s'"%(self.version, version)) + raise IOError("Version mismatch. Requested version was '%s' but " + "version in file was '%s'" % (self.version, version)) - if version == '1': + if version == 1: url, ip_address, date, content_type, length = header.split() - self.file_headers = {"ip_address" : ip_address, - "date" : datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), - "org" : organisation} + self.file_headers = { + "ip_address": ip_address, + "date": datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), + "org": organisation, + "url": url, + 'content_type': content_type, + 'length': int(length), + } self.version = 1 - elif version == '2': + elif version == 2: url, ip_address, date, content_type, result_code, checksum, location, offset, filename, length = header.split() - self.file_headers = {"ip_address" : ip_address, - "date" : datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), - "org" : organisation} + self.file_headers = { + "ip_address": ip_address, + "date": datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), + "org": organisation, + 'url': url, + 'content_type': content_type, + 'length': int(length), + 'filename': filename, + 'location': location, + } self.version = 2 else: - raise IOError("Unknown ARC version '%s'"%version) + raise IOError("Unknown ARC version '%s'" % version) - current = len(payload1) + len(payload2) - self.file_meta = '' - while current < int(length): + length = int(length) + current_size = len(payload1 + payload2) + self.file_meta = b'' + while current_size < length: line = self.fileobj.readline() - current = current + len(line) self.file_meta = self.file_meta + line - self.fileobj.readline() # Lose the separator newline + current_size = current_size + len(line) + self.fileobj.readline() # Lose the separator newline + def _strip_initial_new_lines(self): + line = self.fileobj.readline() + while line and not line.strip(): + line = self.fileobj.readline() + return line.decode('utf-8').strip() + + def _safe_from_arcmetadata(self, line): + # JG: this block stops the header parser / reader + # from getting caught on the XML lump + # that can appear in ARC files + if line.startswith("\n"): + line = self.fileobj.readline().decode('utf-8') + line = self.fileobj.readline().decode('utf-8') + line = self.fileobj.readline().decode('utf-8') + return line.strip() + + def _read_record_header(self, line): + if self.version == 1: + arc_header_re = ARC1_HEADER_RE + elif self.version == 2: + arc_header_re = ARC2_HEADER_RE + + matches = arc_header_re.search(line) + headers = matches.groupdict() + return ARCHeader(**headers) def _read_arc_record(self): "Reads out an arc record, formats it and returns it" - #XXX:Noufal Stream payload here rather than just read it + # XXX:Noufal Stream payload here rather than just read it # r = self.fileobj.readline() # Drop the initial newline # if r == "": # return None # header = self.fileobj.readline() - # Strip the initial new lines and read first line - header = self.fileobj.readline() - while header and header.strip() == "": - header = self.fileobj.readline() - - #JG: this block stops the header parser / reader - #from getting caught on the XML lump - #that can appear in ARC files - if header.startswith("\n"): - header = self.fileobj.readline() - header = self.fileobj.readline() - header = self.fileobj.readline() - - if header == "": - return None + line = self._strip_initial_new_lines() + line = self._safe_from_arcmetadata(line) - if int(self.version) == 1: - arc_header_re = ARC1_HEADER_RE - elif int(self.version) == 2: - arc_header_re = ARC2_HEADER_RE - - matches = arc_header_re.search(header) - headers = matches.groupdict() - arc_header = ARCHeader(**headers) - - payload = self.fileobj.read(int(headers['length'])) + if not line: + return None - self.fileobj.readline() # Munge the separator newline. + header = self._read_record_header(line) + payload = self.fileobj.read(header['length']) - return ARCRecord(header = arc_header, payload = payload) + self.fileobj.readline() # Munge the separator newline. + return ARCRecord(header=header, payload=payload) def read(self): "Reads out an arc record from the file" diff --git a/warc/utils.py b/warc/utils.py index 4298fa2..d725a39 100644 --- a/warc/utils.py +++ b/warc/utils.py @@ -15,6 +15,7 @@ SEP = re.compile("[;:=]") + class CaseInsensitiveDict(MutableMapping): """Almost like a dictionary, but keys are case-insensitive. @@ -51,6 +52,7 @@ def __iter__(self): def __len__(self): return len(self._d) + class FilePart: """File interface over a part of file. From ef20c0cbe48f1c1945000fb0fc1b0ab08476a0c6 Mon Sep 17 00:00:00 2001 From: Daniel Robles Date: Fri, 22 Apr 2016 12:17:04 -0500 Subject: [PATCH 24/28] parse http status code --- setup.py | 2 +- warc/arc.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 96d3f7c..07feba3 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name="warc", - version="0.2.1", + version="0.2.2", description="Python library to work with ARC and WARC files", long_description=open('Readme.rst').read(), license='GPLv2', diff --git a/warc/arc.py b/warc/arc.py index e25d523..f85fdd6 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -154,6 +154,11 @@ def __repr__(self): return ""%s +def status_code(protocol): + http, code, text = protocol.split(' ', 2) + return int(code) + + class ARCRecord(object): def __init__(self, header=None, payload=None, headers={}, version=None): if not (header or headers): @@ -170,8 +175,10 @@ def _read_html_headers(self): self.payload.seek(0) return + line = line.strip() headers = { - 'protocol': line.strip(), + 'protocol': line, + 'status_code': status_code(line), } for line in self.payload: line = line.decode('utf-8') From fc927ce9c47806e000314acbefe627a6582e0e09 Mon Sep 17 00:00:00 2001 From: Daniel Robles Date: Fri, 22 Apr 2016 12:21:22 -0500 Subject: [PATCH 25/28] fix bug when there are only 2 valios in HTTP protocol --- warc/arc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/warc/arc.py b/warc/arc.py index f85fdd6..10325fb 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -155,7 +155,7 @@ def __repr__(self): def status_code(protocol): - http, code, text = protocol.split(' ', 2) + code = protocol.split(' ')[1] return int(code) From 152ce58ace74154585ca95f1d24311ea054758eb Mon Sep 17 00:00:00 2001 From: Daniel Robles Date: Mon, 25 Apr 2016 23:46:58 -0500 Subject: [PATCH 26/28] promote status_code to utils.py --- warc/arc.py | 11 +++-------- warc/utils.py | 32 ++++++++++++++++++++++++++++---- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/warc/arc.py b/warc/arc.py index 10325fb..7c58707 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -12,7 +12,7 @@ import warnings import gzip -from .utils import CaseInsensitiveDict +from .utils import CaseInsensitiveDict, status_code ARC1_HEADER_RE = re.compile(('(?P\S*)\s(?P\S*)\s(?P\S*)' @@ -154,11 +154,6 @@ def __repr__(self): return ""%s -def status_code(protocol): - code = protocol.split(' ')[1] - return int(code) - - class ARCRecord(object): def __init__(self, header=None, payload=None, headers={}, version=None): if not (header or headers): @@ -199,8 +194,8 @@ def from_string(cls, string, version): TODO: It might be best to merge this with the _read_arc_record function rather than reimplement the functionality here. """ - header, payload = string.split("\n",1) - if payload[0] == '\n': # There's an extra + header, payload = string.split("\n", 1) + if payload[0] == '\n': # There's an extra payload = payload[1:] if int(version) == 1: arc_header_re = ARC1_HEADER_RE diff --git a/warc/utils.py b/warc/utils.py index d725a39..404eff8 100644 --- a/warc/utils.py +++ b/warc/utils.py @@ -7,15 +7,39 @@ :copyright: (c) 2012 Internet Archive """ -from collections import MutableMapping, Mapping -from http.client import HTTPMessage -import email.parser -import sys +from collections import MutableMapping import re SEP = re.compile("[;:=]") +def status_code(protocol): + code = protocol.split(' ')[1] + return int(code) + + +def get_http_headers(f): + line = f.readline().decode('utf-8') + if not line.startswith("HTTP/1"): + f.seek(0) + return + + line = line.strip() + headers = { + 'protocol': line, + 'status_code': status_code(line), + } + for line in f: + line = line.decode('utf-8') + if not line.strip(): + break + name, content = line.split(':', 1) + name = name.strip() + content = content.strip() + headers[name.lower()] = content + return headers + + class CaseInsensitiveDict(MutableMapping): """Almost like a dictionary, but keys are case-insensitive. From cf920f38d6369a9c7a770249bb94e27c4c570526 Mon Sep 17 00:00:00 2001 From: Daniel Robles Date: Mon, 25 Apr 2016 23:47:30 -0500 Subject: [PATCH 27/28] start support for 0.18 version and parse http headers --- warc/warc.py | 65 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/warc/warc.py b/warc/warc.py index 0d35b5f..a5eaae0 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -11,13 +11,12 @@ import builtins import datetime import uuid -import logging import re import io import hashlib -import sys -from .utils import CaseInsensitiveDict, FilePart +from .utils import CaseInsensitiveDict, FilePart, get_http_headers + class WARCHeader(CaseInsensitiveDict): """The WARC Header object represents the headers of a WARC record. @@ -51,9 +50,9 @@ class WARCHeader(CaseInsensitiveDict): """ CONTENT_TYPES = dict(warcinfo='application/warc-fields', - response='application/http; msgtype=response', - request='application/http; msgtype=request', - metadata='application/warc-fields') + response='application/http; msgtype=response', + request='application/http; msgtype=request', + metadata='application/warc-fields') KNOWN_HEADERS = { "type": "WARC-Type", @@ -68,13 +67,14 @@ class WARCHeader(CaseInsensitiveDict): } def __init__(self, headers, defaults=False): - self.version = "WARC/1.0" super().__init__(headers) if defaults: self.init_defaults() + self.version = "WARC/%s" % self.get('warc-version', '1.0') def init_defaults(self): - """Initializes important headers to default values, if not already specified. + """Initializes important headers to default values, + if not already specified. The WARC-Record-ID header is set to a newly generated UUID. The WARC-Date header is set to the current datetime. @@ -95,7 +95,10 @@ def write_to(self, f): for name, value in self.items(): name = name.title() # Use standard forms for commonly used patterns - name = name.replace("Warc-", "WARC-").replace("-Ip-", "-IP-").replace("-Id", "-ID").replace("-Uri", "-URI") + name = (name.replace("Warc-", "WARC-") + .replace("-Ip-", "-IP-") + .replace("-Id", "-ID") + .replace("-Uri", "-URI")) f.write(str(name).encode()) f.write(b": ") f.write(str(value).encode()) @@ -132,6 +135,7 @@ def __str__(self): def __repr__(self): return "" % (self.type, self.record_id) + class WARCRecord(object): """The WARCRecord object represents a WARC Record. """ @@ -161,6 +165,24 @@ def __init__(self, header=None, payload=None, headers={}, defaults=True): self.payload = payload self._content = None + self._custom_cases() + + def _custom_cases(self): + # TODO: this need to be done using other pattern, but first we need + # tests + if self.version == '0.18': + self._custom_0_18() + + def _custom_0_18(self): + if not self.type == 'response': + return + + if not self['content-type'].startswith('application/http'): + return + + headers = get_http_headers(self.payload) + self.header['http_headers'] = headers + def _compute_digest(self, payload): return "sha1:" + hashlib.sha1(payload).hexdigest() @@ -197,6 +219,10 @@ def date(self): def checksum(self): return self.header.get('WARC-Payload-Digest') + @property + def version(self): + return self.header['warc-version'] + @property def offset(self): """Offset of this record in the warc file from which this record is read. @@ -212,7 +238,6 @@ def __getitem__(self, name): elif name in self.content: return self.content[name] - def __setitem__(self, name, value): self.header[name] = value @@ -225,7 +250,8 @@ def __str__(self): return str(f.getvalue()) def __repr__(self): - return "" % (self.type, self['WARC-Record-ID']) + return "" % (self.type, + self['WARC-Record-ID']) @staticmethod def from_response(response): @@ -240,7 +266,8 @@ def from_response(response): http_response = response.raw._original_response # HTTP status line, headers as string - status_line = "HTTP/1.1 %d %s" % (http_response.status, http_response.reason) + status_line = "HTTP/1.1 %d %s" % (http_response.status, + http_response.reason) headers = str(http_response.msg) # Read raw response data out of request @@ -260,6 +287,7 @@ def from_response(response): } return WARCRecord(payload=payload, headers=headers) + class WARCFile: def __init__(self, filename=None, mode=None, fileobj=None, compress=None): if fileobj is None: @@ -299,7 +327,6 @@ def read_record(self): """Reads a warc record from this WARC file.""" return self.reader.read_record() - def close(self): self.fileobj.close() @@ -330,10 +357,11 @@ def tell(self): """ return self.fileobj.tell() + class WARCReader: RE_VERSION = re.compile("WARC/(\d+.\d+)\r\n") RE_HEADER = re.compile(r"([a-zA-Z_\-]+): *(.*)\r\n") - SUPPORTED_VERSIONS = ["1.0"] + SUPPORTED_VERSIONS = ["1.0", "0.18"] def __init__(self, fileobj): self.fileobj = fileobj @@ -351,10 +379,12 @@ def read_header(self, fileobj): if version not in self.SUPPORTED_VERSIONS: raise IOError("Unsupported WARC version: %s" % version) - headers = {} + headers = { + 'warc-version': version, + } while True: line = fileobj.readline().decode("utf-8") - if line == "\r\n": # end of headers + if line == "\r\n": # end of headers break m = self.RE_HEADER.match(line) if not m: @@ -372,7 +402,8 @@ def expect(self, fileobj, expected_line, message=None): def finish_reading_current_record(self): # consume the footer from the previous record if self.current_payload: - # consume all data from the current_payload before moving to next record + # consume all data from the current_payload before + # moving to next record self.current_payload.read() self.expect(self.current_payload.fileobj, "\r\n") if self.current_payload.length: From 52469aab34340f9001b43269087230e9f5ea841e Mon Sep 17 00:00:00 2001 From: Daniel Robles Date: Tue, 26 Apr 2016 00:20:52 -0500 Subject: [PATCH 28/28] a little of pep8 for arc.py --- warc/arc.py | 85 +++++++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 42 deletions(-) diff --git a/warc/arc.py b/warc/arc.py index 7c58707..93cb9ec 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -146,12 +146,14 @@ def __str__(self): def __repr__(self): f = {} - for i in "url ip_address date content_type result_code checksum location offset filename length".split(): - if hasattr(self,i): + fields = ("url ip_address date content_type result_code checksum " + "location offset filename length".split()) + for i in fields: + if hasattr(self, i): f[i] = getattr(self, i) - s = ['%s = "%s"'%(k, v) for k,v in f.items()] + s = ['%s = "%s"' % (k, v) for k, v in f.items()] s = ", ".join(s) - return ""%s + return "" % s class ARCRecord(object): @@ -205,16 +207,19 @@ def from_string(cls, string, version): matches = arc_header_re.search(header) headers = matches.groupdict() arc_header = ARCHeader(**headers) - return cls(header = arc_header, payload = payload, version = version) + return cls(header=arc_header, payload=payload, version=version) - def write_to(self, f, version = None): + def write_to(self, f, version=None): version = version or self.version or 2 self.header.write_to(f, version) - f.write("\n") # This separates the header and the body - if isinstance(self.payload, str): #Usually used for small payloads + f.write("\n") # This separates the header and the body + # Usually used for small payloads + if isinstance(self.payload, str): f.write(self.payload) - elif hasattr(self.payload, "read"): #Used for large payloads where we give a file like object - chunk_size = 10 * 1024 * 1024 # Read 10MB by 10MB + # Used for large payloads where we give a file like object + elif hasattr(self.payload, "read"): + # Read 10MB by 10MB + chunk_size = 10 * 1024 * 1024 d = self.payload.read(chunk_size) while d: f.write(d) @@ -227,7 +232,6 @@ def __getitem__(self, name): def __setitem__(self, name, value): self.header[name] = value - def __str__(self): f = io.StringIO() self.write_to(f) @@ -235,7 +239,8 @@ def __str__(self): class ARCFile(object): - def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_headers = {}, compress=None): + def __init__(self, filename=None, mode=None, fileobj=None, version=None, + file_headers=None, compress=None): """ Initialises a file like object that can be used to read or write Arc files. Works for both version 1 or version 2. @@ -297,10 +302,10 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ else: self.filename = "" - if version != None and int(version) not in (1, 2): + if version and int(version) not in (1, 2): raise TypeError("ARC version has to be 1 or 2") self.version = version - self.file_headers = file_headers + self.file_headers = file_headers or {} self.header_written = False self.header_read = False self.file_meta = '' @@ -318,32 +323,37 @@ def _write_header(self): self.file_headers['org'] = "Unknown" if "date" not in self.file_headers: now = datetime.datetime.utcnow() - warnings.warn("Using '%s' for Archiving time"%now) + warnings.warn("Using '%s' for Archiving time" % now) self.file_headers['date'] = now if "ip_address" not in self.file_headers: - warnings.warn("Using '127.0.0.1' as IP address of machine that's archiving") + warnings.warn("Using '127.0.0.1' as IP address of machine " + "that's archiving") self.file_headers['ip_address'] = "127.0.0.1" if self.version == 1: - payload = "1 0 %(org)s\nURL IP-address Archive-date Content-type Archive-length"%dict(org = self.file_headers['org']) + payload = ("1 0 %s\nURL IP-address Archive-date Content-type " + "Archive-length") % self.file_headers['org'] elif self.version == 2: - payload = "2 0 %(org)s\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length" + payload = ("2 0 %s\nURL IP-address Archive-date Content-type " + "Result-code Checksum Location Offset Filename " + "Archive-length" % self.file_headers['org']) else: - raise IOError("Can't write an ARC file with version '\"%s\"'"%self.version) + raise IOError("Can't write an ARC file " + "with version '\"%s\"'" % self.version) fname = os.path.basename(self.filename) - header = ARCHeader(url = "filedesc://%s"%fname, - ip_address = self.file_headers['ip_address'], - date = self.file_headers['date'], - content_type = "text/plain", - length = len(payload), - result_code = "200", - checksum = "-", - location = "-", - offset = str(self.fileobj.tell()), - filename = fname) - arc_file_header_record = ARCRecord(header, payload%self.file_headers) + header = ARCHeader(url="filedesc://%s" % fname, + ip_address=self.file_headers['ip_address'], + date=self.file_headers['date'], + content_type="text/plain", + length=len(payload), + result_code="200", + checksum="-", + location="-", + offset=str(self.fileobj.tell()), + filename=fname) + arc_file_header_record = ARCRecord(header, payload % self.file_headers) arc_file_header_record.write_to(self.fileobj, self.version) - self.fileobj.write("\n") # record separator + self.fileobj.write("\n") # record separator def write(self, arc_record): "Writes out the given arc record to the file" @@ -383,7 +393,8 @@ def _read_file_header(self): } self.version = 1 elif version == 2: - url, ip_address, date, content_type, result_code, checksum, location, offset, filename, length = header.split() + (url, ip_address, date, content_type, result_code, + checksum, location, offset, filename, length) = header.split() self.file_headers = { "ip_address": ip_address, "date": datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), @@ -472,13 +483,3 @@ def __iter__(self): def close(self): self.fileobj.close() - - - - - - - - - -