https://github.com/lukluk/scraper-engine
Async Scraper Framework Based Nodejs
https://github.com/lukluk/scraper-engine
scraper-engine scraping
Last synced: 11 months ago
JSON representation
Async Scraper Framework Based Nodejs
- Host: GitHub
- URL: https://github.com/lukluk/scraper-engine
- Owner: lukluk
- Created: 2014-05-23T02:12:21.000Z (about 12 years ago)
- Default Branch: master
- Last Pushed: 2016-05-25T11:53:15.000Z (about 10 years ago)
- Last Synced: 2025-04-05T20:51:14.986Z (about 1 year ago)
- Topics: scraper-engine, scraping
- Language: JavaScript
- Homepage: http://scraper-engine.com
- Size: 2.85 MB
- Stars: 11
- Watchers: 2
- Forks: 7
- Open Issues: 2
-
Metadata Files:
- Readme: README.md
Awesome Lists containing this project
README
# Scraper engine
> A complete, easy-to-use solution for building a scraper API service.
Version 8.0.0
Output formats: output.json and output.csv
# Tutorial
https://youtu.be/j_PJkSVx7n4
https://www.youtube.com/watch?v=HHP2NyEJq4w
# How to install
```
npm install scraper-engine
```
Create `app.js`:
```
// app.js: start the scraper-engine service for this directory on the chosen port.
var engine = require('scraper-engine');
var port = 4000;

engine.start(__dirname, port);
```
```
$ node app.js
Scraper Engine Started (port 4000)...
```
Then open this URL in your browser:
http://localhost:4000/output.json?site=**controller**
## Example controller
http://localhost:4000/output.json?site=olx
or
http://localhost:4000/output.csv?site=olx
Example: Walmart Category controller
```
var S = require('string');
exports.scraper = {
name: 'OLX',
url: function (index) {
return "http://www.walmart.com/browse/toys/action-figures/4171_4172_133130?page="+index+"&cat_id=4171_4172_133130"
},
next:function($,currentindex){
if(currentindex>=5){
return false
}else{
return true
}
},
rows: function ($) {
return $('.tile-grid-unit-wrapper');
},
fields: {
title: function ($) {
return S($.find('.tile-heading').text()).trim().s;
},
price: function ($) {
return S($.find('.tile-price').text().replace('$','')).trim().s;
},
image: function ($) {
return $.find('.product-image').attr('src');
},
urlproduct: function ($) {
return $.find('.js-product-title').attr('href');
}
}
}
```
Example: Olx Controller
```
var S = require('string');
exports.scraper = {
name: 'OLX',
url: function () {
return "http://olx.co.id/all-results/q-batu-bacan/"
},
rows: function ($) {
return $('.offer');
},
fields: {
title: function ($) {
return S($.find('.link.linkWithHash').text()).trim().s;
},
price: function ($) {
return S($.find('.price').text()).trim().s;
},
image: function ($) {
return $.find('.linkWithHash img').attr('src');
}
}
}
```
# Using request parameters
Example: Olx Controller part 2
```
var S = require('string');
var keyword="";
exports.scraper = {
name: 'OLX-pass-url',
url: function () {
return "http://olx.co.id/all-results/q-"+keyword+"/"
},
setup:function(req){
keyword=req.query.keyword
},
rows: function ($) {
return $('.offer');
},
fields: {
title: function ($) {
return S($.find('.link.linkWithHash').text()).trim().s;
},
price: function ($) {
return S($.find('.price').text()).trim().s;
},
image: function ($) {
return $.find('.linkWithHash img').attr('src');
}
}
}
```
# Scraping all detail pages
Example: Olx Controller part 3
```
var S = require('string');
var keyword="";
exports.scraper = {
name: 'OLX-all-pages',
url: function () {
return "http://olx.co.id/all-results/q-"+keyword+"/"
},
setup:function(req){
keyword=req.query.keyword
},
list: function ($) {
var urls=[];
$('.offer').each(function(){
ulrs.push($(this).find('.link.linkWithHash').attr('href'))
})
return urls;
},
fields: {
title: function ($) {
return $.find('.offerheadinner h1').text()
},
price: function ($) {
return $.find('.pricelabel strong').text()
},
seller: function ($) {
return $.find('.userdetails .brkword').text();
}
}
}
```
# Author
## luklukaha@gmail.com