간단한 웹 스크래핑을 설정하기 위해 node.js, request 및 cheerio를 사용하는 방법

소개:

이 자습서에서는

Cheerio는 가볍고 빠르고 유연하며 사용하기 쉽습니다.

요구 사항:

이미 node.js, jQuery 및

node.js가 익숙하지 않거나 아직 설치하지 않았다면

코드:

NPM을 사용하여 필요한 모듈을 설치하려면 다음을 입력하십시오.

npm install request cheerio

그러면 현재 작업 디렉토리에 모듈이 설치됩니다.

모듈을 전역으로 설치하려면 다음을 실행하십시오.

npm install -g request cheerio

scrape.js라는 파일을 만들고 다음 줄을 추가합니다.

var request = require('request');
var cheerio = require('cheerio');

그러면 모든 모듈 종속성이 로드됩니다.

이제 간단한 요청으로 Hacker News의 첫 페이지를 로드하고

다음 줄을 파일에 추가합니다.

// Fetch the Hacker News front page and print the raw HTML of the response.
request('https://news.ycombinator.com', function (error, response, html) {
  // Report transport-level failures instead of exiting without any output.
  if (error) {
    console.error('Request failed:', error);
    return;
  }
  // Only a 200 response carries the page we want; report anything else.
  if (response.statusCode !== 200) {
    console.error('Unexpected status code:', response.statusCode);
    return;
  }
  console.log(html);
});

node scrape.js를 사용하여 스크립트를 실행하고

원하는 메타 데이터를 추출하는 방법을 알기 위해서는 다음을 알아야 합니다.

이 예에서 저는 Chrome에서 Hacker News를 열고 마우스 오른쪽 버튼을 클릭한 다음

웹 개발자 콘솔을 간단히 살펴본 후

요청 코드를 다음과 같이 변경하여 내 가정을 테스트할 수 있습니다.

// Fetch the Hacker News front page and print the text of the element that
// immediately precedes each `span.comhead` (expected to be the story link).
request('https://news.ycombinator.com', function (error, response, html) {
  // Report transport-level failures instead of exiting without any output.
  if (error) {
    console.error('Request failed:', error);
    return;
  }
  // Only a 200 response is parsed; report anything else.
  if (response.statusCode !== 200) {
    console.error('Unexpected status code:', response.statusCode);
    return;
  }
  var $ = cheerio.load(html);
  $('span.comhead').each(function(i, element){
    // `this` is the current span.comhead; its previous sibling holds the title.
    var a = $(this).prev();
    console.log(a.text());
  });
});

예상한 대로 코드를 실행하면 30개의 제목 목록을 얻을 수 있습니다. 이제 나머지 메타데이터도 추출해 봅시다.

// Fetch the Hacker News front page and, for every `span.comhead`, print an
// object holding the story's rank, title, URL, points, username and comments.
request('https://news.ycombinator.com', function (error, response, html) {
  // Report transport-level failures instead of exiting without any output.
  if (error) {
    console.error('Request failed:', error);
    return;
  }
  // Only a 200 response is parsed; report anything else.
  if (response.statusCode !== 200) {
    console.error('Unexpected status code:', response.statusCode);
    return;
  }
  var $ = cheerio.load(html);
  $('span.comhead').each(function(i, element){
    // Select the element that precedes the current span.comhead
    var a = $(this).prev();
    // The element two levels up holds text that starts with the rank
    var rank = a.parent().parent().text();
    var title = a.text();
    var url = a.attr('href');
    // Children of the `.subtext` cell in the row that follows
    var subtext = a.parent().parent().next().children('.subtext').children();
    var points = $(subtext).eq(0).text();
    var username = $(subtext).eq(1).text();
    var comments = $(subtext).eq(2).text();
    // Parsed metadata. parseInt reads the leading digits and ignores the rest
    // of the text; the explicit radix 10 forces decimal. A text without a
    // leading number yields NaN.
    var metadata = {
      rank: parseInt(rank, 10),
      title: title,
      url: url,
      points: parseInt(points, 10),
      username: username,
      comments: parseInt(comments, 10)
    };
    console.log(metadata);
  });
});

다음은 추가된 코드가 수행하는 작업에 대한 개요입니다.

이전 요소를 선택합니다.

var a = $(this).prev();

"a" 요소보다 두 수준 위의 요소를 구문 분석하여 순위를 가져옵니다.

var rank = a.parent().parent().text();

링크 제목을 구문 분석합니다.

var title = a.text();

"a" 요소에서 href 특성을 구문 분석합니다.

var url = a.attr('href');

HTML 테이블의 다음 행에서 하위 텍스트를 가져옵니다.

var subtext = a.parent().parent().next().children('.subtext').children();

하위 항목에서 관련 데이터를 추출합니다.

var points = $(subtext).eq(0).text();
var username = $(subtext).eq(1).text();
var comments = $(subtext).eq(2).text();

수정된 스크립트를 실행하면 다음과 같은 객체 배열이 출력됩니다.

[ { rank: 1,
    title: 'The Meteoric Rise of DigitalOcean ',
    url: 'http://news.netcraft.com/archives/2013/06/13/the-meteoric-rise-of-digitalocean.html',
    points: 240,
    username: 'beigeotter',
    comments: 163 },
  { rank: 2,
    title: 'Introducing Private Networking',
    url: 'https://linux-console.net/blog_posts/introducing-private-networking',
    points: 172,
    username: 'Goranek',
    comments: 75 },
...

그게 다야! 이제 추출된 데이터를 저장할 수 있습니다.

var request = require('request');
var cheerio = require('cheerio');

// Fetch the Hacker News front page, build one metadata object per
// `span.comhead` found in it, and print the collected array once at the end.
request('https://news.ycombinator.com', function (error, response, html) {
  // Report transport-level failures instead of exiting without any output.
  if (error) {
    console.error('Request failed:', error);
    return;
  }
  // Only a 200 response is parsed; report anything else.
  if (response.statusCode !== 200) {
    console.error('Unexpected status code:', response.statusCode);
    return;
  }
  var $ = cheerio.load(html);
  var parsedResults = [];
  $('span.comhead').each(function(i, element){
    // Select the previous element
    var a = $(this).prev();
    // Get the rank by parsing the element two levels above the "a" element
    var rank = a.parent().parent().text();
    // Parse the link title
    var title = a.text();
    // Parse the href attribute from the "a" element
    var url = a.attr('href');
    // Get the subtext children from the next row in the HTML table.
    var subtext = a.parent().parent().next().children('.subtext').children();
    // Extract the relevant data from the children
    var points = $(subtext).eq(0).text();
    var username = $(subtext).eq(1).text();
    var comments = $(subtext).eq(2).text();
    // Parsed metadata. parseInt reads the leading digits and ignores the rest
    // of the text; the explicit radix 10 forces decimal. A text without a
    // leading number yields NaN.
    var metadata = {
      rank: parseInt(rank, 10),
      title: title,
      url: url,
      points: parseInt(points, 10),
      username: username,
      comments: parseInt(comments, 10)
    };
    // Push meta-data into parsedResults array
    parsedResults.push(metadata);
  });
  // Log our finished parse results in the terminal
  console.log(parsedResults);
});