
Commit 9855e71

Add molsa call-for-bids scraping
1 parent 62cc110 commit 9855e71

2 files changed (+201, −0)

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
from hashlib import md5
from dataflows import Flow, printer
import requests
from pyquery import PyQuery as pq

s = requests.Session()

# Constant fields added to every scraped row.
BASE = {
    'publisher': 'משרד העבודה, הרווחה והשירותים החברתיים',  # Ministry of Labor, Social Affairs and Social Services
    'tender_type': 'call_for_bids',
    'publication_id': None,
}

# One entry per column of the search-results table:
# (field name for the cell text, optional field name for the cell's link).
# '_' marks columns whose text is discarded.
headers = [
    ('page_title', 'documents'),
    ('target_audience', ),
    ('start_date', ),
    ('claim_date', ),
    ('_', ),
    ('decision', ),
    ('_', 'page_url'),
]

# (field name, CSS selector) pairs for the per-call details page;
# selectors ending in 'li' are collected as lists of item texts.
details_headers = [
    ('reason', '#ctl00_PlaceHolderMain_lblPubAppealSubject'),
    ('publishing_unit', '#ctl00_PlaceHolderMain_lblPubAppealPublisherFactor'),
    ('ordering_unit', '#ctl00_PlaceHolderMain_lblPubAppealOrderFactor'),
    ('ordering_units', '#ctl00_PlaceHolderMain_lblPubAppealOrderFactor li'),
    ('description', '#ctl00_PlaceHolderMain_lblPubAppealSummary'),
    ('contact', '#ctl00_PlaceHolderMain_lblPubAppealHowToAppeal'),
    ('contact_email', '#ctl00_PlaceHolderMain_lblPubAppealHowToAppeal a'),
    ('required_documents', '#ctl00_PlaceHolderMain_lblPubAppealRequiredDocuments li'),
    ('partners', '#ctl00_PlaceHolderMain_lblPubAppealMembers'),
]


def fetch_calls():
    """Scrape the search-results table and yield one row per call for bids."""
    URL = 'https://www.molsa.gov.il/Publications/Pages/PubAppealNewSearch.aspx'

    catalog = pq(s.get(URL).text)
    for row in catalog.find('.ms-listviewtable tr'):
        ret = {}
        ret.update(BASE)
        # Pre-populate the detail fields so every row has the same schema.
        ret.update(dict(
            (k, None)
            for k, *_ in details_headers
        ))
        cells = pq(row).find('td')
        # Skip header/separator rows that lack the expected column count.
        if len(cells) == len(headers):
            for header, cell in zip(headers, cells):
                cell, main, *anchor = pq(cell), *header
                ret[main] = cell.text()
                if len(anchor) > 0:
                    a = cell.find('a')
                    if len(a) > 0:
                        href = pq(a).attr('href')
                        if href.startswith('/'):
                            href = 'https://www.molsa.gov.il{}'.format(href)
                        ret[anchor[0]] = href
            yield ret


def call_details():
    """Fetch each call's details page and fill in the detail fields."""
    def func(row):
        details = pq(s.get(row['page_url']).text)
        for key, selector, *_ in details_headers:
            if selector.endswith('li'):
                row[key] = [pq(x).text() for x in details.find(selector)]
            else:
                row[key] = details.find(selector).text()
    return func


def resolve_ordering_unit():
    """Fold the singular 'ordering_unit' into the 'ordering_units' list."""
    def func(row):
        if row.get('ordering_unit') and not row.get('ordering_units'):
            row['ordering_units'] = [row['ordering_unit']]
        row['ordering_unit'] = None
    return func


def fix_documents():
    """Wrap the scraped document link in a list of document objects."""
    def func(row):
        href = row['documents']
        title = row['page_title']
        update_date = row['start_date']
        row['documents'] = [
            dict(
                description=title,
                link=href,
                update_date=update_date
            )
        ]
    return func


def calculate_publication_id():
    """Derive a stable numeric ID from the publisher and page title."""
    def func(row):
        # First 4 bytes of the md5 digest, read as a big-endian integer...
        title_hash = int.from_bytes(
            md5(
                (row['publisher'] + row['page_title']).encode('utf8')
            ).digest()[:4],
            'big'
        )
        # ...mapped into [1,000,000,000, 2,000,000,000) so the ID always
        # has exactly ten digits.
        mod = 1000000000
        title_hash = mod + (title_hash % mod)
        row['publication_id'] = title_hash
    return func


def flow(*args):
    return Flow(
        fetch_calls(),
        call_details(),
        resolve_ordering_unit(),
        fix_documents(),
        calculate_publication_id(),
        printer()
    )


if __name__ == '__main__':
    flow().process()
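
A note on the ID scheme above: calculate_publication_id() takes the first four bytes of the md5 of publisher + page_title and maps them into [1,000,000,000, 2,000,000,000), so every call gets a stable ten-digit integer ID regardless of input. A minimal standalone sketch of the same derivation (the sample inputs are hypothetical):

from hashlib import md5

def publication_id(publisher, page_title):
    # Same derivation as calculate_publication_id() above.
    h = int.from_bytes(
        md5((publisher + page_title).encode('utf8')).digest()[:4],
        'big'
    )
    mod = 1000000000
    return mod + (h % mod)

# Hypothetical inputs -- the result always has exactly ten digits.
for title in ('some call for bids', 'another call for bids'):
    pid = publication_id('משרד העבודה', title)
    assert 1000000000 <= pid < 2000000000
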
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
scraper-exemptions:
  schedule:
    crontab: "0 16 * * *"

  pipeline:
    - run: add_metadata
      parameters:
        name: calls-for-bids
    # get the main HTML page of the calls-for-bids search
    - flow: molsa
      runner: tzabar
    - run: concatenate
      parameters:
        fields:
          publication_id: []
          tender_type: []
          page_title: []
          publisher: []
          publishing_unit: []
          ordering_units: []

          start_date: []
          claim_date: []

          decision: []
          reason: []
          description: []

          contact: []
          contact_email: []

          target_audience: []
          required_documents: []
          partners: []

          documents: []
    - run: set_types
      parameters:
        types:
          publication_id:
            type: integer
          ordering_units:
            type: array
            es:itemType: string

          start_date:
            type: date
            format: '%Y/%m/%d'
          claim_date:
            type: date
            format: '%Y/%m/%d'

          required_documents:
            type: array
            es:itemType: string

          documents:
            type: array
            es:itemType: object
            es:schema:
              fields:
                - {name: link, type: string}
                - {name: description, type: string}
                - {name: update_date, type: string}

    - run: set_primary_key
      parameters:
        calls_to_bids:
          - publication_id
    - run: dump_to_path
      parameters:
        out-path: /var/datapackages/procurement/calls_to_bids
    - run: dump_to_sql
      parameters:
        tables:
          calls_to_bids:
            resource-name: calls_to_bids
            mode: update
