diff --git a/code/lobbyscraper.py b/code/lobbyscraper.py
index 24d5ccd..ca09480 100644
--- a/code/lobbyscraper.py
+++ b/code/lobbyscraper.py
@@ -7,11 +7,14 @@
"""
import re
-from datetime import datetime, date, time
+from datetime import datetime
import json
import os
from bs4 import BeautifulSoup
-import urllib2
+try:
+ import urllib.request as urllib2
+except ImportError:
+ import urllib2
__author__ = "Stefan Kasberger"
__copyright__ = "Copyright 2015"
@@ -41,10 +44,10 @@ def SetupEnvironment():
def FetchHtml(url):
"""Fetches html url via urllib().
-
+
Args:
url: url to fetch
-
+
Returns:
html string as unicode
"""
@@ -54,12 +57,12 @@ def FetchHtml(url):
def FetchHtmlList(url, folder, filename):
"""Fetches html from the overview list of the lobbyingregister entries and saves it locally.
-
+
Args:
url: url to fetch
folder: to save the html
filename: filename for the html file
-
+
Returns:
html string
"""
@@ -71,11 +74,11 @@ def FetchHtmlList(url, folder, filename):
def FetchHtmlOrganisations(organisations, folder):
"""Fetches html from a lobbying-organisation and saves it locally.
-
+
Args:
        organisations: dict with sequential ids of organisations as keys.
folder: to save the html
-
+
Returns:
        dict() of sequential ids of organisations as keys and html as values.
"""
@@ -88,24 +91,24 @@ def FetchHtmlOrganisations(organisations, folder):
def Save2File(data, filename):
"""Saves file locally
-
+
Args:
data: string to save
filename: name of the file
-
+
Returns:
na
"""
- text_file = open(filename, "w")
+ text_file = open(filename, "wb")
text_file.write(data.encode('utf-8'))
text_file.close()
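# Sketch, not part of the patch: opening in 'wb' and encoding explicitly keeps
# Save2File byte-identical on Python 2 and 3. A matching reader would need the
# reverse decode; whether the real ReadFile does this is an assumption, its
# body is not changed in this hunk and _read_example is a hypothetical name.
def _read_example(filename):
    with open(filename, 'rb') as f:
        return f.read().decode('utf-8')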
def ReadFile(filename):
"""Reads file and returns the html.
-
+
Args:
filename: name of the file
-
+
Returns:
html from the file.
"""
@@ -115,10 +118,10 @@ def ReadFile(filename):
def ReadOrganisations(folder):
"""Reads in all html-files from the organisations folder.
-
+
Args:
folder: folder where the organisation html-files are stored.
-
+
Returns:
        dict() of sequential ids of organisations as keys and html as values.
"""
@@ -129,13 +132,13 @@ def ReadOrganisations(folder):
html[int(filename.split('.')[0])] = rawHtml
return html
-def ParseList(html, timestamp):
+def ParseList(html):
"""Parses the needed facts out of the overview list html.
-
+
Args:
- html: html string
+ html: html string
-        timestamp: time when the html download was started.
-
+
Returns:
        dict() of sequential ids of organisations as keys and a dict() of facts as values.
"""
@@ -150,43 +153,49 @@ def ParseList(html, timestamp):
# assign variables from html table to dict
organisation = {}
- organisation['description'] = unicode(tds[1].string) # organisation
+ try:
+ organisation['description'] = unicode(tds[1].string) # organisation
+ except NameError:
+ organisation['description'] = tds[1].string # organisation
organisation['registry-department'] = tds[3].string # register department
organisation['url'] = BASE_URL+'/'+tds[2].a['href'] # register number url
organisation['last-update'] = str(datetime.strptime(tds[5].string, '%d.%m.%Y')) # last update
organisation['register-number'] = tds[2].string
        # organisation['details'] = lxml.html.tostring(tds[4], encoding='unicode')[4:-4].split('<br/>')[:-1] # details
-
+
lobbyList[counter] = organisation
counter += 1
return lobbyList
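# Sketch, not part of the patch: the try/except NameError fallback used above
# (and again in ParseOrganisations below) could be centralised in one helper;
# the name _to_text is hypothetical and does not exist in the module.
def _to_text(value):
    try:
        return unicode(value)  # Python 2
    except NameError:
        return str(value)  # Python 3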
def ParseOrganisations(htmlList, organisations):
"""Parses the needed facts out of the organisation html.
-
+
Args:
htmlList: list() of html strings.
        organisations: dict() of sequential ids of organisations as keys and a dict() of facts as values.
-
+
Returns:
        dict() of sequential ids of organisations as keys and a dict() of facts as values.
"""
for id in organisations.keys():
soup = BeautifulSoup(htmlList[id])
- html = unicode(soup)
-
- # regex type of registry department: B, C
+ try:
+ html = unicode(soup)
+ except NameError:
+ html = str(soup)
+
+ # regex type of registry department: B, C
regDepartment = re.findall(r'Registerabteilung:\n