PROJ::UBENDO::ML_SCRAPPING

The example below shows how to build PROJ::UBENDO::ML_SCRAPPING using HTML, CSS and JavaScript.

Thumbnail
This awesome code was written by n0xd3v; you can see more from this user in their personal repository.
You can find the original code on Codepen.io
Copyright n0xd3v ©

Technologies

  • HTML
  • CSS
  • JavaScript
<!DOCTYPE html>
<html lang="en" >

<head>
  <meta charset="UTF-8">
  <title>PROJ::UBENDO::ML_SCRAPPING</title>
  
  
  <!-- Third-party styles: the Bootswatch "slate" Bootstrap theme and Font Awesome 4.6.3 -->
  <link rel='stylesheet prefetch' href='https://bootswatch.com/slate/bootstrap.min.css'>
<link rel='stylesheet prefetch' href='https://maxcdn.bootstrapcdn.com/font-awesome/4.6.3/css/font-awesome.min.css'>

      <!-- Page-specific styles -->
      <link rel="stylesheet" href="css/style.css">

  
</head>

<body>

  <div class="container-fluid">
  <div class="row">
    <!-- Page heading -->
    <div class="col-md-12">
      <div class="well well-sm">
        <h3>Extract Products</h3>
      </div>
    </div>
    <!-- URL entry: text field (#input-url) with its "Go!" button (#btn-exec) -->
    <div class="col-md-12">
      <div class="panel panel-default">
        <div class="panel-body">
          <div class="input-group">
             <input id="input-url" type="text" class="form-control" placeholder="Type URL..." />
             <span class="input-group-btn">
                  <button id="btn-exec" class="btn btn-default" type="button">Go!</button>
             </span>
          </div>
        </div>
      </div>    
    </div>        
    <!-- "Process" panel: an empty alert placeholder (#alert-ajax) and a two-column table (#tb-process) whose body starts empty -->
    <div class="col-md-12">
      <div class="panel panel-default">
        <div class="panel-heading">Process</div>
        <div id="content" class="panel-body">
          <div id="alert-ajax"></div>
          <table id="tb-process" class="table">
            <thead><tr><th>#</th><th>PROCESS</th></tr></thead>
            <tbody></tbody>
          </table>      
        </div>
      </div>    
    </div>
    <!-- "Products" panel: result table (#tb-prod) with an empty body; rows are presumably added at runtime by js/index.js -->
    <div class="col-md-12">
      <div class="panel panel-default">
        <div class="panel-heading">Products</div>
        <div id="content-2" class="panel-body">
          <table id="tb-prod" class="table table-striped">
            <thead>
              <tr>
                <th>#</th>
                <th>ID</th>
                <th>title</th>
                <th>price</th>
                <th>condition</th>
                <th>variation1</th>
                <th>variation2</th>
                <th>images_url</th>                
              </tr>
            </thead>
            <tbody></tbody>
          </table>      
        </div>
      </div>    
    </div>    
  </div>
</div>
  <!-- jQuery 3.1.0 is loaded ahead of the page script -->
  <script src='https://cdnjs.cloudflare.com/ajax/libs/jquery/3.1.0/jquery.min.js'></script>

  

    <!-- Page script -->
    <script  src="js/index.js"></script>




</body>

</html>

/*Downloaded from https://www.codeseek.co/n0xd3v/projubendoml_scrapping-aBbXKq */
/* Small gap between the top of the viewport and the page content. */
body {
  margin-top: 9px;
}

/* Translucent shadow with a 6px spread around wells and panels. */
.well,
.panel {
  box-shadow: 0px 0px 1px 6px rgba(0,0,0,0.1);
}

/* Centre the heading inside wells and the panel headings. */
.well h3,
.panel-heading {
  text-align: center;
}

/*Downloaded from https://www.codeseek.co/n0xd3v/projubendoml_scrapping-aBbXKq */
// Predefined value: pre-fill the URL field with a sample listing URL.
$('#input-url').val('http://listado.mercadolibre.com.ve/_CustId_209911503');
// Proxy prefix that every outgoing request is routed through.
var cors = 'https://crossorigin.me/';
// Tails of the two deferred chains the click handler extends to run its
// requests one after another (listing pages first, product pages second).
var dfd = $.Deferred();
var dfdNext = dfd;
var dfdNew = $.Deferred();
var dfdNewNext = dfdNew;
/**
 * Issues a GET request for `value` through the `cors` proxy.
 *
 * The proxy prefix is added only when `value` does not already start with
 * it, so callers may pass either a bare URL or an already-prefixed one
 * without the request being proxied twice.
 *
 * @param {string} value - Target URL, with or without the proxy prefix.
 * @returns {jqXHR} The jqXHR object returned by `$.get`.
 */
var requestAjax = function (value) {
  var target = String(value);
  if (target.indexOf(cors) !== 0) {
    target = cors + target;
  }
  return $.get(target);
};
// Listing-page URLs found by the most recent pagination scan.
var arrayNavLinks = [];
// Product-page URLs collected from the listing pages.
var arrayUrlLinks = [];
// Index shown in the first column of the next product row.
var tbIndex = 0;
//functions
/**
 * Process 1: fetches `url` and collects the pagination links found in it.
 *
 * The links are gathered into a fresh array on every call and then stored
 * in the shared `arrayNavLinks`, so links from an earlier call never leak
 * into the result. The first and last pagination anchors are dropped and
 * `url` itself is placed at the front; when the page has no pagination the
 * result is just `[url]`.
 *
 * @param {string} url - Listing URL to inspect.
 * @returns {Promise} jQuery promise resolved with the array of page URLs,
 *   or rejected with an error message string.
 */
var getMLNav = function(url){
  console.log("init: getMLNav");

  var dfd1 = $.Deferred();

  // requestAjax adds the proxy prefix itself, so the raw URL is passed here.
  requestAjax(url).then(
    function(data){
      var pageLinks = [];
      $(data).find('.ch-pagination li a').each(function(){
        pageLinks.push( $(this).attr('href') );
      });
      // Drop the first and last anchors, then put the requested page first.
      pageLinks = pageLinks.slice(1, pageLinks.length - 1);
      pageLinks.unshift(url);
      arrayNavLinks = pageLinks;
      dfd1.resolve(arrayNavLinks);
    },
    function(jqXHR, textStatus, errorThrown){
      dfd1.reject('Disculpe, algo fue mal. ' + textStatus + errorThrown);
    }
  );

  return dfd1.promise();

}
//process-2
/**
 * Process 2: fetches one listing page and appends the product URLs found
 * in it to the shared `arrayUrlLinks`.
 *
 * Product URLs are read from the `item-url` attribute of every
 * `div.images-viewer` inside `#searchResults`; elements without that
 * attribute are skipped so no `undefined` entry is queued for fetching.
 *
 * @param {string} url - Listing page URL.
 * @returns {Promise} jQuery promise resolved with `arrayUrlLinks` (all
 *   links collected so far), or rejected with an error message string.
 */
var getMLUrl = function(url){
  console.log("init: getMLUrl");
  var dfd2 = $.Deferred();

  // requestAjax adds the proxy prefix itself, so the raw URL is passed here.
  requestAjax(url).then(
    function(data){
      // Search the parsed results container directly instead of serialising
      // it with .html() and parsing it again; this also matches viewers that
      // are direct children of #searchResults.
      $(data).find('#searchResults').find('div.images-viewer').each(function(){
        var itemUrl = $( this ).attr('item-url');
        if (itemUrl) {
          arrayUrlLinks.push(itemUrl);
        }
      });
      dfd2.resolve(arrayUrlLinks);
    },
    function(jqXHR, textStatus, errorThrown){
      dfd2.reject('Disculpe, algo fue mal. ' + textStatus + errorThrown);
    }
  );

  return dfd2.promise();
}
//process-3
/**
 * Process 3: fetches one product page, extracts its details from the
 * `.short-description` element and appends them as a row to `#tb-prod`.
 *
 * Every cell is filled with `.text()`, so markup contained in the fetched
 * page is displayed literally instead of being parsed and inserted into
 * this document. A field that is missing from the page yields an empty cell.
 *
 * @param {string} url - Product page URL.
 * @returns {Promise} jQuery promise resolved with the index of the row that
 *   was added, or rejected with an error message string.
 */
var getMLInfo = function(url){
  console.log("init: getMLInfo");
  var dfd3 = $.Deferred();

  // Builds a <td> whose content is plain text, never HTML.
  var makeCell = function(value){
    return $('<td>').text(value == null ? '' : String(value));
  };

  // requestAjax adds the proxy prefix itself, so the raw URL is passed here.
  requestAjax(url).then(
    function(data){
      var product = $(data).find('.short-description');

      // Sizes: strip all whitespace and skip empty entries, which avoids a
      // leading comma without chopping the first character of a real value.
      var allSizes = [];
      product.find('.vip-variation-list li').each(function(){
        var size = $(this).text().replace(/\s+/g, '');
        if (size !== '') {
          allSizes.push(size);
        }
      });

      // Gallery image URLs.
      var allImages = [];
      product.find('.gallery-content img').each(function(){
        allImages.push($(this).attr('src'));
      });

      // Cells follow the column order of the #tb-prod header.
      var row = $('<tr>').append(
        makeCell(tbIndex),
        makeCell(product.find('input[name="item_id"]').val()),
        makeCell(product.find('.vip-title-main').text()),
        makeCell(product.find('.vip-price').text()),
        makeCell(product.find('.item-conditions dd:first').text()),
        makeCell(product.find('.variations-title:eq(0)').text()),
        makeCell(allSizes.join(",")),
        makeCell(allImages.join(", "))
      );
      $('#tb-prod').append(row);

      // Report the index used for this row, then advance it for the next one.
      dfd3.resolve(tbIndex);
      tbIndex++;
    },
    function(jqXHR, textStatus, errorThrown){
      dfd3.reject('Disculpe, algo fue mal. ' + textStatus + errorThrown);
    }
  );

  return dfd3.promise();
}
//button action
/**
 * Click handler for #btn-exec: runs the three scraping steps for the URL
 * typed into #input-url.
 *
 *  1. getMLNav  - find the listing pages.
 *  2. getMLUrl  - fetch each listing page, one after another, collecting
 *                 product URLs into `arrayUrlLinks`.
 *  3. getMLInfo - fetch each product page collected by this run, one after
 *                 another, adding a table row per product.
 *
 * Requests are serialised by extending the shared deferred chains
 * (`dfdNext`, `dfdNewNext`) with one step per URL. Each step receives its
 * URL through a per-iteration callback parameter; a loop-scoped `var` would
 * be shared by all steps and leave every deferred step with the last URL.
 */
$(document).on('click', '#btn-exec', function() {
  var url = $('#input-url').val();

  //process-1: get links from nav element
  getMLNav(url).then(
    //process-2: on success, get product links from each listing page
    function(nav){
      // Position in arrayUrlLinks where this run's links begin. It is
      // recorded as a step on the chain so it is taken only after any
      // earlier run has finished collecting its own links.
      var firstNewLink = 0;

      // Resolving the head lets the chain start; on later clicks this is a
      // no-op and the new steps simply queue behind the existing chain.
      dfd.resolve();
      dfdNext = dfdNext.pipe(function () {
        firstNewLink = arrayUrlLinks.length;
      });

      nav.forEach(function (pageUrl) {
        dfdNext = dfdNext.pipe(function () {
          return getMLUrl(pageUrl).then(
            function(response) {
              // nothing to do: links accumulate in arrayUrlLinks
            },
            function(response){
              console.log('getMLUrl: ' + response);
            }
          );
        });
      });

      //process-3: when every listing page is done, get the product details
      dfdNext.done(function(){
        dfdNew.resolve();
        // Only the links gathered by this run; earlier runs already
        // produced rows for theirs.
        arrayUrlLinks.slice(firstNewLink).forEach(function (productUrl) {
          console.log(productUrl);

          dfdNewNext = dfdNewNext.pipe(function () {
            return getMLInfo(productUrl).then(
              function(response) {
                console.log(response);
              },
              function(response){
                console.log('getMLInfo: ' + response);
              }
            );
          });
        });
      });
    },
    function(v){ console.log('getMLNav: ' + v); }
  );


});

Comments