Ecosyste.ms: Awesome

An open API service indexing awesome lists of open source software.

Awesome Lists | Featured Topics | Projects

https://github.com/saltyshiomix/nest-crawler

The easiest crawling and scraping module for NestJS
https://github.com/saltyshiomix/nest-crawler

crawler nestjs nodejs scraper typescript

Last synced: 2 months ago
JSON representation

The easiest crawling and scraping module for NestJS

Awesome Lists containing this project

README

        


Nest Logo


😎 nest-crawler 😎


Crawler and Scraper Module for NestJS









Package License (MIT)

## Installation

```bash
$ npm install --save nest-crawler
```

## Usage

First, register it in the application module so that Nest can handle dependencies:

```ts
import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';

// Root module: registering NestCrawlerModule here lets Nest inject
// NestCrawlerService anywhere in the application.
@Module({
  imports: [
    NestCrawlerModule,
  ],
})
export class AppModule {}
```

Then, just import it and use it:

**crawler.module.ts**

```ts
import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';

// Feature module: importing NestCrawlerModule makes NestCrawlerService
// available to providers declared in this module.
@Module({
  imports: [
    NestCrawlerModule,
  ],
})
export class CrawlerModule {}
```

**crawler.service.ts**

```ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}

// scraping the specific page
public async scrape(): Promise {
interface ExampleCom {
title: string;
info: string;
content: string;
}

const data: ExampleCom = await this.crawler.fetch({
target: 'http://example.com',
fetch: {
title: 'h1',
info: {
selector: 'p > a',
attr: 'href',
},
content: {
selector: '.content',
how: 'html',
},
},
});

console.log(data);
// {
// title: 'Example Domain',
// info: 'http://www.iana.org/domains/example',
// content: '


Example Heading


Example Paragraph


'
// }
}

// crawling multi pages is also supported
public async crawl(): Promise {
interface HackerNewsPage {
title: string;
}

const pages: HackerNewsPage[] = await this.crawler.fetch({
target: {
url: 'https://news.ycombinator.com',
iterator: {
selector: 'span.age > a',
convert: (x: string) => `https://news.ycombinator.com/${x}`,
},
},
fetch: (data: any, index: number, url: string) => ({
title: '.title > a',
}),
});

console.log(pages);
// [
// { title: 'Post Title 1' },
// { title: 'Post Title 2' },
// ...
// ...
// { title: 'Post Title 30' }
// ]
}
}
```

## Recipe

### Single Page Scraping

```ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}

public async scrape(): Promise {
interface ExampleCom {
title: string;
info: string;
content: string;
}

const data: ExampleCom = await this.crawler.fetch({
target: 'http://example.com',
fetch: {
title: 'h1',
info: {
selector: 'p > a',
attr: 'href',
},
content: {
selector: '.content',
how: 'html',
}
},
});

console.log(data);
// {
// title: 'Example Domain',
// info: 'http://www.iana.org/domains/example',
// content: '


Example Heading


Example Paragraph


'
// }
}
}
```

### Multi Pages Crawling

#### You Already Know the Target URLs

```ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  // Crawls a fixed list of URLs: pass an array as `target` and the same
  // `fetch` descriptor is applied to every page.
  public async crawl(): Promise<void> {
    interface Site {
      title: string;
    }

    const sites: Site[] = await this.crawler.fetch({
      target: [
        'https://example1.com',
        'https://example2.com',
        'https://example3.com',
      ],
      fetch: (data: any, index: number, url: string) => ({
        title: 'h1',
      }),
    });

    console.log(sites);
    // [
    //   { title: 'An easiest crawling and scraping module for NestJS' },
    //   { title: 'A minimalistic boilerplate on top of Webpack, Babel, TypeScript and React' },
    //   { title: '[Experimental] React SSR as a view template engine' }
    // ]
  }
}
```

#### You Don't Know the Target Urls so Want to Crawl Dynamically

```ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  // Dynamic crawling: the `iterator` scrapes links from the index page,
  // `convert` turns each match into an absolute URL, and `fetch` runs on
  // every resulting page.
  public async crawl(): Promise<void> {
    interface Page {
      title: string;
    }

    const pages: Page[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://news.ycombinator.com/${x}`,
        },
      },
      // fetch each `https://news.ycombinator.com/${x}` and scrape data
      fetch: (data: any, index: number, url: string) => ({
        title: '.title > a',
      }),
    });

    console.log(pages);
    // [
    //   { title: 'Post Title 1' },
    //   { title: 'Post Title 2' },
    //   ...
    //   ...
    //   { title: 'Post Title 30' }
    // ]
  }
}
```

#### You Need to Pass Data Dynamically

```ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  // Passing data between the index page and each crawled page: the inner
  // `fetch` on `target` scrapes data from the index, which is then handed
  // to the outer `fetch` callback as `data` for each crawled URL.
  public async crawl(): Promise<void> {
    interface Img {
      src: string;
    }

    const images: Img[] = await this.crawler.fetch({
      target: {
        url: 'https://some.image.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://some.image.com${x}`,
        },
        fetch: {
          imageIds: {
            listItem: 'div.image',   // one entry per matched list item
            data: {
              id: {
                selector: 'div.image-wrapper',
                attr: 'data-image-id',
              },
            },
          },
        },
      },
      // fetch each `https://some.image.com${x}`, pass data and scrape data
      fetch: (data: any, index: number, url: string) => ({
        src: {
          convert: () => `https://some.image.com/images/${data.imageIds[index]}.png`,
        },
      }),
    });

    console.log(images);
    // [
    //   { src: 'https://some.image.com/images/1.png' },
    //   { src: 'https://some.image.com/images/2.png' },
    //   ...
    //   ...
    //   { src: 'https://some.image.com/images/100.png' }
    // ]
  }
}
```

#### Waitable (by using `puppeteer`)

```ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  // Waiting for client-rendered content: `waitFor` (milliseconds) delays
  // scraping until the page has had time to render, via puppeteer.
  public async crawl(): Promise<void> {
    interface Page {
      title: string;
    }

    const pages: Page[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://news.ycombinator.com/${x}`,
        },
      },
      waitFor: 3 * 1000, // wait for the content loaded! (like single page apps)
      fetch: (data: any, index: number, url: string) => ({
        title: '.title > a',
      }),
    });

    console.log(pages);
    // [
    //   { title: 'Post Title 1' },
    //   { title: 'Post Title 2' },
    //   ...
    //   ...
    //   { title: 'Post Title 30' }
    // ]
  }
}
```

## Related

- [@web-master/node-web-fetch](https://github.com/saltyshiomix/web-master/blob/master/packages/node-web-fetch)