Ecosyste.ms: Awesome
An open API service indexing awesome lists of open source software.
https://github.com/saltyshiomix/nest-crawler
An easiest crawling and scraping module for NestJS
https://github.com/saltyshiomix/nest-crawler
crawler nestjs nodejs scraper typescript
Last synced: 2 months ago
JSON representation
An easiest crawling and scraping module for NestJS
- Host: GitHub
- URL: https://github.com/saltyshiomix/nest-crawler
- Owner: saltyshiomix
- License: mit
- Created: 2019-08-22T22:01:31.000Z (over 5 years ago)
- Default Branch: master
- Last Pushed: 2023-01-04T07:56:32.000Z (almost 2 years ago)
- Last Synced: 2024-10-12T05:46:49.390Z (3 months ago)
- Topics: crawler, nestjs, nodejs, scraper, typescript
- Language: TypeScript
- Homepage: https://npm.im/nest-crawler
- Size: 498 KB
- Stars: 63
- Watchers: 6
- Forks: 8
- Open Issues: 12
-
Metadata Files:
- Readme: README.md
- License: LICENSE
Awesome Lists containing this project
README
😎 nest-crawler 😎
Crawler and Scraper Module for NestJS
## Installation
```bash
$ npm install --save nest-crawler
```

## Usage
First, register it in the application module so that Nest can handle dependencies:
```ts
import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';

@Module({
imports: [
NestCrawlerModule,
],
})
export class AppModule {}
```

Then, just import it and use it:
**crawler.module.ts**
```ts
import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';
@Module({
imports: [
NestCrawlerModule,
],
})
export class CrawlerModule {}
```

**crawler.service.ts**
```ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}

// scraping the specific page
public async scrape(): Promise<void> {
interface ExampleCom {
title: string;
info: string;
content: string;
}

const data: ExampleCom = await this.crawler.fetch({
target: 'http://example.com',
fetch: {
title: 'h1',
info: {
selector: 'p > a',
attr: 'href',
},
content: {
selector: '.content',
how: 'html',
},
},
});

console.log(data);
// {
// title: 'Example Domain',
// info: 'http://www.iana.org/domains/example',
// content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>'
// }
}

// crawling multi pages is also supported
public async crawl(): Promise<void> {
interface HackerNewsPage {
title: string;
}

const pages: HackerNewsPage[] = await this.crawler.fetch({
target: {
url: 'https://news.ycombinator.com',
iterator: {
selector: 'span.age > a',
convert: (x: string) => `https://news.ycombinator.com/${x}`,
},
},
fetch: (data: any, index: number, url: string) => ({
title: '.title > a',
}),
});

console.log(pages);
// [
// { title: 'Post Title 1' },
// { title: 'Post Title 2' },
// ...
// ...
// { title: 'Post Title 30' }
// ]
}
}
```

## Recipe
### Single Page Scraping
```ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}

public async scrape(): Promise<void> {
interface ExampleCom {
title: string;
info: string;
content: string;
}

const data: ExampleCom = await this.crawler.fetch({
target: 'http://example.com',
fetch: {
title: 'h1',
info: {
selector: 'p > a',
attr: 'href',
},
content: {
selector: '.content',
how: 'html',
}
},
});

console.log(data);
// {
// title: 'Example Domain',
// info: 'http://www.iana.org/domains/example',
// content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>'
// }
}
}
```

### Multi Pages Crawling
#### You Already Know the Target URLs
```ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}

public async crawl(): Promise<void> {
interface Site {
title: string;
}

const sites: Site[] = await this.crawler.fetch({
target: [
'https://example1.com',
'https://example2.com',
'https://example3.com',
],
fetch: (data: any, index: number, url: string) => ({
title: 'h1',
}),
});

console.log(sites);
// [
// { title: 'An easiest crawling and scraping module for NestJS' },
// { title: 'A minimalistic boilerplate on top of Webpack, Babel, TypeScript and React' },
// { title: '[Experimental] React SSR as a view template engine' }
// ]
}
}
```

#### You Don't Know the Target URLs and Want to Crawl Dynamically
```ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}

public async crawl(): Promise<void> {
interface Page {
title: string;
}

const pages: Page[] = await this.crawler.fetch({
target: {
url: 'https://news.ycombinator.com',
iterator: {
selector: 'span.age > a',
convert: (x: string) => `https://news.ycombinator.com/${x}`,
},
},
// fetch each `https://news.ycombinator.com/${x}` and scrape data
fetch: (data: any, index: number, url: string) => ({
title: '.title > a',
}),
});

console.log(pages);
// [
// { title: 'Post Title 1' },
// { title: 'Post Title 2' },
// ...
// ...
// { title: 'Post Title 30' }
// ]
}
}
```

#### You Need to Pass Data Dynamically
```ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}

public async crawl(): Promise<void> {
interface Img {
src: string;
}

const images: Img[] = await this.crawler.fetch({
target: {
url: 'https://some.image.com',
iterator: {
selector: 'span.age > a',
convert: (x: string) => `https://some.image.com${x}`,
},
fetch: {
imageIds: {
listItem: 'div.image',
data: {
id: {
selector: 'div.image-wrapper',
attr: 'data-image-id',
},
},
},
},
},
// fetch each `https://some.image.com${x}`, pass data and scrape data
fetch: (data: any, index: number, url: string) => ({
src: {
convert: () => `https://some.image.com/images/${data.imageIds[index]}.png`,
},
}),
});

console.log(images);
// [
// { src: 'https://some.image.com/images/1.png' },
// { src: 'https://some.image.com/images/2.png' },
// ...
// ...
// { src: 'https://some.image.com/images/100.png' }
// ]
}
}
```

#### Waitable (by using `puppeteer`)
```ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}

public async crawl(): Promise<void> {
interface Page {
title: string;
}

const pages: Page[] = await this.crawler.fetch({
target: {
url: 'https://news.ycombinator.com',
iterator: {
selector: 'span.age > a',
convert: (x: string) => `https://news.ycombinator.com/${x}`,
},
},
waitFor: 3 * 1000, // wait for the content to load (e.g. for single-page apps)
fetch: (data: any, index: number, url: string) => ({
title: '.title > a',
}),
});

console.log(pages);
// [
// { title: 'Post Title 1' },
// { title: 'Post Title 2' },
// ...
// ...
// { title: 'Post Title 30' }
// ]
}
}
```

## Related
- [@web-master/node-web-fetch](https://github.com/saltyshiomix/web-master/blob/master/packages/node-web-fetch)