forked from radoraykov/rating-gov-representatives
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #48 from simov/eu-tenders
Извличане на спечелени европейски проекти/конкурси
- Loading branch information
Showing
17 changed files
with
1,045 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
|
||
## База данни | ||
```sql | ||
-- create database | |
CREATE SCHEMA `eu-tenders` DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci ; | ||
-- grant access | |
GRANT ALL ON `eu-tenders`.* TO liolio@localhost ; | ||
``` | ||
|
||
При разработката използвах _MySQL_ със [sequelize][3], конфигурацията се намира във `model/models.js` | ||
|
||
Причината да използвам реална база данни, вместо текстов файл: | ||
|
||
- гарантирам уникалност на записите създавани при _scrape_ на проект | ||
- не държа всички данни в паметта по време на изпълнение на скрипта | ||
|
||
|
||
## Команди | ||
```bash | ||
# print out all available commands | ||
node app.js -h | ||
# run all tests | ||
npm test | ||
# run a single test (npm install -g mocha) | ||
mocha -g 'scrape project page' | ||
# run the script | ||
npm start | ||
# run the admin (npm install -g express-admin) | ||
# user: admin, pass: 11aaAA | ||
admin x-admin/ | ||
``` | ||
|
||
## Модел | ||
![eu-tenders][2] | ||
|
||
|
||
## Етап 1 | ||
Обхождам [списъка с проекти][1], за да взема техните `id`та. | ||
|
||
Освен това попълвам и _Място на изпълнение_ `place`, тъй като е форматирано добре. Отделните компоненти са разделени с `;` | ||
|
||
Взимам и _Продължителност месеци_ `duration`, тъй като не намерих тази извадка на страницата за проект | ||
|
||
Този етап е бавен, тъй като техният сървър прави заявка към базата за всички записи. Освен това тегленето на страници е последователно, тъй като се използва `__VIEWSTATE` от предходната. Отне 2 часа и 15 минути за 1150 заявки/страници. | |
|
||
|
||
## Етап 2 | ||
Правя заявка за всеки `project` запис в базата, за да попълня липсващата информация. | ||
|
||
Този етап е по-бърз, тъй като заявката на техния сървър отнема по-малко време, и дърпам по 5 проекта паралелно. Отне 45 минути за 11465 заявки. | |
|
||
|
||
## Изпълнение | ||
1. Създаване на базата по някакъв начин, виж командите в началото | ||
2. Инсталация | ||
```bash | ||
cd eu-tenders | ||
npm install | ||
``` | ||
3. Синхронизация на моделите с базата | ||
```bash | ||
node app.js -s | |
``` | ||
4. Стартиране на скрипта | ||
```bash | ||
npm start | ||
``` | ||
|
||
|
||
[1]: http://umispublic.government.bg/prProcedureProjectsInfo.aspx?op=-1&proc=-2&clear=1 | ||
[2]: http://i.imgur.com/ZqSjLA8.png | ||
[3]: https://github.com/sequelize/sequelize |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
|
||
require('colors'); | ||
var spawn = require('child_process').spawn; | ||
var async = require('async'), | ||
cmd = require('commander'); | ||
|
||
var eu = require('./lib/index')(); | ||
var models = require('./model/models'); | ||
|
||
// Declare the command line interface: the program version plus one boolean
// flag per supported action. After parsing, each flag is read further down
// as a property of `cmd` (cmd.list, cmd.project, cmd.truncate, cmd.sync).
cmd.version('1.0.0');
cmd.option('-l, --list', 'Scrape projects list');
cmd.option('-p, --project', 'Scrape each individual project');
cmd.option('-t, --truncate', 'Truncate all tables');
cmd.option('-s, --sync', 'Sync all defined DAOs to the DB');
cmd.parse(process.argv);
|
||
|
||
|
||
/**
 * Empties every model table when the `--truncate` flag was given.
 *
 * Without the flag `done` is called immediately. Otherwise each listed model
 * is truncated; a failure is logged and does not prevent the remaining
 * tables from being processed.
 *
 * @param {Function} done - invoked once all tables have been handled
 */
function truncate (done) {
    if (cmd.truncate) {
        var tables = ['Project', 'Contractor', 'Program', 'Executors', 'Partners'];

        async.each(tables, function (table, next) {
            var query = models[table].destroy({}, {truncate:true});
            query
                .success(function () {next()})
                // best effort: report the failure but keep going
                .error(function (err) {console.log(err); next()});
        }, done);
        return;
    }

    return done();
}
|
||
/**
 * Prints a completion banner for the finished action and ends the process.
 *
 * It is used as a callback with the action name pre-bound, so any further
 * arguments passed by the caller are ignored.
 *
 * @param {String} action - label of the action that has just finished
 */
function done (action) {
    // `.cyan` / `.rainbow` are string decorations, presumably installed by the
    // `colors` module required at the top of the file.
    var label = action.cyan;
    var banner = 'DONE!'.rainbow;

    console.log(label, banner);
    process.exit();
}
|
||
|
||
// Dispatch on the parsed CLI flags; only the first matching action runs.
//
// The combined `--list --project` case has to be tested BEFORE the
// single-flag cases: with `cmd.list` checked first the combined branch could
// never be reached and `-l -p` silently behaved like `-l` alone.
if (cmd.list && cmd.project) {
    async.series([
        // wrapped so each scraper is still invoked as a method of its own
        // object, exactly like in the single-flag branches below
        function (next) { eu.scrape.projects.all(next); },
        function (next) { eu.scrape.project.all(next); }
    ], done.bind(null,'list & projects'));
}
else if (cmd.list) {
    eu.scrape.projects.all(done.bind(null,'list'));
}
else if (cmd.project) {
    eu.scrape.project.all(done.bind(null,'projects'));
}
else if (cmd.truncate) {
    truncate(done.bind(null,'truncate'));
}
else if (cmd.sync) {
    // the models file is run as a separate node process; we finish when it exits
    spawn('node', ['model/models.js']).on('exit', done.bind(null,'sync'));
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
|
||
/**
 * Factory for the library facade.
 *
 * The sub-modules are required when the factory is called, not when this
 * file is loaded.
 *
 * @returns {Object} object exposing `request`, `parse`, `scrape` and `w` (the write module)
 */
exports = module.exports = function () {
    var api = {};

    api.request = require('./request');
    api.parse = require('./parse');
    api.scrape = require('./scrape');
    api.w = require('./write');

    return api;
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
|
||
var cheerio = require('cheerio'), | ||
moment = require('moment'); | ||
|
||
|
||
exports.projects = { | ||
total: function (html) { | ||
var $ = cheerio.load(html); | ||
|
||
return { | ||
viewstate: $('#__VIEWSTATE').val(), | ||
total: parseInt($('#ContentPlaceHolder1_lblMaxRows') | ||
.text().trim().replace('Общ брой: ','')) | ||
}; | ||
}, | ||
page: function (html) { | ||
var $ = cheerio.load(html); | ||
|
||
var projects = []; | ||
$('.InfoTableProposal tr').each(function (index) { | ||
if (!index) return; | ||
projects.push({ | ||
id: parseInt($('td',this).eq(3).find('a').attr('href').replace(/.*id=(\d+).*/,'$1')), | ||
place: $('td',this).eq(2).text().trim(), | ||
duration: parseFloat($('td',this).eq(8).text().trim()) | ||
}); | ||
}); | ||
|
||
return { | ||
viewstate: $('#__VIEWSTATE').val(), | ||
projects: projects | ||
}; | ||
} | ||
}; | ||
|
||
|
||
exports.project = { | ||
page: function (html) { | ||
var $ = cheerio.load(html); | ||
var project = {}, ref = {beneficiary:null, program:null, partners:[], executors: []}; | ||
|
||
function span (ctx, row) { | ||
return $('tr',ctx).eq(row).find('td').eq(1).find('span'); | ||
} | ||
function a (ctx, row) { | ||
return $('tr',ctx).eq(row).find('td').eq(1).find('a'); | ||
} | ||
|
||
(function identification (ctx) { | ||
project.isun = span(ctx,1).text().trim(); | ||
project.number = span(ctx,2).text().trim(); | ||
project.number = (project.number == '---') ? null : project.number; | ||
project.name = span(ctx,3).text().trim(); | ||
|
||
ref.beneficiary = { | ||
id: parseInt(a(ctx,4).attr('href').replace(/.*benef=(\d+).*/,'$1')), | ||
name: a(ctx,4).text().trim() | ||
}; | ||
project.beneficiary_id = ref.beneficiary.id; | ||
|
||
ref.program = { | ||
id: parseInt(a(ctx,5).attr('href').replace(/.*op=(\d+).*/,'$1')), | ||
name: a(ctx,5).text().trim(), | ||
source: span(ctx,5).text().trim().replace('==>','').trim() | ||
}; | ||
project.program_id = ref.program.id; | ||
|
||
project.date_contract = moment(span(ctx,6).text().trim(), 'DD.MM.YYYY').format('YYYY-MM-DD'); | ||
project.date_begin = moment(span(ctx,7).text().trim(), 'DD.MM.YYYY').format('YYYY-MM-DD'); | ||
project.date_end = moment(span(ctx,8).text().trim(), 'DD.MM.YYYY').format('YYYY-MM-DD'); | ||
|
||
project.status = span(ctx,9).text().trim(); | ||
}($('#ContentPlaceHolder1_divIdentification'))); | ||
|
||
|
||
(function description (ctx) { | ||
project.description = span(ctx,1).html().trim(); | ||
project.activities = span(ctx,2).html().trim(); | ||
}($('#ContentPlaceHolder1_divDescription'))); | ||
|
||
|
||
(function partners (ctx) { | ||
if (!$('table', ctx).length) return; | ||
$('tr', ctx).each(function (index) { | ||
ref.partners.push({ | ||
id:parseInt($('a', this).attr('href').replace(/.*benef=(\d+).*/,'$1')), | ||
name:$('a', this).text().trim() | ||
}); | ||
}); | ||
}($('#ContentPlaceHolder1_tdPartners'))); | ||
|
||
|
||
(function executors (ctx) { | ||
if (!$('table', ctx).length) return; | ||
$('tr', ctx).each(function (index) { | ||
ref.executors.push({ | ||
id:parseInt($('a', this).attr('href').replace(/.*benef=(\d+).*/,'$1')), | ||
name:$('a', this).text().trim() | ||
}); | ||
}); | ||
}($('#ContentPlaceHolder1_tdExecutors'))); | ||
|
||
|
||
(function financial (ctx) { | ||
project.budget_approved = parseInt(span(ctx,1).text().trim().replace(' BGN','').replace(' ','')||0); | ||
project.budget_total = parseInt(span(ctx,2).text().trim().replace(' BGN','').replace(' ','')||0); | ||
project.budget_bfp_total = parseInt(span(ctx,3).text().trim().replace(' BGN','').replace(' ','')||0); | ||
project.budget_paid = parseInt(span(ctx,4).text().trim().replace(' BGN','').replace(' ','')||0); | ||
|
||
var bfp = null; | ||
|
||
bfp = $('#ContentPlaceHolder1_tdBFP_EU_AssumedAmount').text().trim().replace(' ',''); | ||
project.budget_bfp_eu = bfp ? parseInt(bfp||0) : 0; | ||
|
||
bfp = $('#ContentPlaceHolder1_tdBFP_National_AssumedAmount').text().trim().replace(' ',''); | ||
project.budget_bfp_nat = bfp ? parseInt(bfp||0) : 0; | ||
|
||
bfp = $('#ContentPlaceHolder1_tdBenef_AssumedAmount').text().trim().replace(' ',''); | ||
project.budget_benef = bfp ? parseInt(bfp||0) : 0; | ||
}($('#ContentPlaceHolder1_divFinansicalInfo'))); | ||
|
||
|
||
(function indicators (ctx) { | ||
|
||
}($('#ContentPlaceHolder1_divIndicators'))); | ||
|
||
|
||
return {project:project, ref:ref}; | ||
} | ||
}; |
Large diffs are not rendered by default.
Oops, something went wrong.
Oops, something went wrong.