PROJ::UBENDO::ML_SCRAPPING

Tutorials of (Proj::ubendo::ml_scrapping) by N0xd3v

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>PROJ::UBENDO::ML_SCRAPPING</title>

  <!-- Third-party styles first so the local stylesheet can override them -->
  <link rel="stylesheet" href="https://bootswatch.com/slate/bootstrap.min.css">
  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.3/css/font-awesome.min.css">
  <link rel="stylesheet" href="css/style.css">
</head>

<body>
  <!--
    Single-page UI for the product extractor.
    The ids #input-url, #btn-exec, #alert-ajax, #tb-process and #tb-prod are
    hooks used by js/index.js; keep them stable.
  -->
  <main class="container-fluid">
    <div class="row">
      <div class="col-md-12">
        <div class="well well-sm">
          <h3>Extract Products</h3>
        </div>
      </div>

      <!-- URL entry: the script reads #input-url when #btn-exec is clicked -->
      <div class="col-md-12">
        <div class="panel panel-default">
          <div class="panel-body">
            <!-- Screen-reader label; the placeholder alone is not an accessible name -->
            <label class="sr-only" for="input-url">Listing URL</label>
            <div class="input-group">
              <input id="input-url" type="text" inputmode="url" class="form-control" placeholder="Type URL...">
              <span class="input-group-btn">
                <button id="btn-exec" class="btn btn-default" type="button">Go!</button>
              </span>
            </div>
          </div>
        </div>
      </div>

      <!-- Process log -->
      <div class="col-md-12">
        <div class="panel panel-default">
          <div class="panel-heading">Process</div>
          <div id="content" class="panel-body">
            <!-- Live region so status messages injected here are announced -->
            <div id="alert-ajax" aria-live="polite"></div>
            <table id="tb-process" class="table">
              <caption class="sr-only">Process steps</caption>
              <thead>
                <tr>
                  <th scope="col">#</th>
                  <th scope="col">PROCESS</th>
                </tr>
              </thead>
              <tbody></tbody>
            </table>
          </div>
        </div>
      </div>

      <!-- Extracted products: the script appends one row per product to #tb-prod -->
      <div class="col-md-12">
        <div class="panel panel-default">
          <div class="panel-heading">Products</div>
          <div id="content-2" class="panel-body">
            <table id="tb-prod" class="table table-striped">
              <caption class="sr-only">Extracted products</caption>
              <thead>
                <tr>
                  <th scope="col">#</th>
                  <th scope="col">ID</th>
                  <th scope="col">title</th>
                  <th scope="col">price</th>
                  <th scope="col">condition</th>
                  <th scope="col">variation1</th>
                  <th scope="col">variation2</th>
                  <th scope="col">images_url</th>
                </tr>
              </thead>
              <tbody></tbody>
            </table>
          </div>
        </div>
      </div>
    </div>
  </main>

  <!-- Kept at the end of body: js/index.js touches the DOM as soon as it runs -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.1.0/jquery.min.js"></script>
  <script src="js/index.js"></script>
</body>
</html>
/* Downloaded from https://www.codeseek.co/ */
/* Small gap between the top of the viewport and the first panel */
body {
  margin-top: 9px;
}

/* Soft outline shared by the title well and every panel */
.well,
.panel {
  box-shadow: 0px 0px 1px 6px rgba(0,0,0,0.1);
}

/* Centre the page title and the panel headings */
.well h3,
.panel-heading {
  text-align: center;
}
/* Downloaded from https://www.codeseek.co/ */
// Predefined value: pre-fill the URL field with a sample listing
$('#input-url').val('http://listado.mercadolibre.com.ve/_CustId_209911503');

var cors = 'https://crossorigin.me/',   // proxy prefix put in front of request URLs
    dfd = $.Deferred(),                 // head of the first sequential chain
    dfdNext = dfd,                      // moving tail of the first chain
    dfdNew = $.Deferred(),              // head of the second sequential chain
    dfdNewNext = dfdNew;                // moving tail of the second chain
/**
 * Issues a GET request for `value` through the CORS proxy.
 *
 * The proxy prefix is added only when `value` does not already start with
 * it, so passing either a bare URL or an already-prefixed one results in a
 * single prefix (previously an already-prefixed URL was prefixed twice).
 *
 * @param {string} value Target URL, with or without the proxy prefix.
 * @returns {jqXHR} The jqXHR object returned by $.get.
 */
var requestAjax = function (value) {
  var target = String(value);
  if (target.indexOf(cors) !== 0) {
    target = cors + target;
  }
  return $.get(target);
};
var arrayNavLinks = [],   // listing-page URLs collected by getMLNav
    arrayUrlLinks = [],   // product URLs collected by getMLUrl
    tbIndex = 0;          // running row number written by getMLInfo
//functions
/**
 * Process 1: fetches a listing page and collects the URLs of its pages.
 *
 * The result is rebuilt from scratch on every call and then stored in the
 * shared `arrayNavLinks`, so links from an earlier run never leak into a
 * later one.
 *
 * @param {string} url Listing URL entered by the user.
 * @returns {Promise} jQuery promise resolved with the array of page URLs
 *   (`url` itself first), or rejected with an error message string.
 */
var getMLNav = function (url) {
  console.log("init: getMLNav");

  var dfd1 = $.Deferred();

  // requestAjax() adds the proxy prefix itself; adding it here as well
  // would send the request through the proxy twice.
  requestAjax(url).then(
    function (data) {
      var pageLinks = [];

      // No pagination on the page simply leaves pageLinks empty.
      $(data).find('.ch-pagination li a').each(function () {
        pageLinks.push($(this).attr('href'));
      });

      // Drop the first and last anchors, then put the requested URL first.
      // NOTE(review): presumably these are the current-page and "next"
      // anchors; confirm against the live markup.
      pageLinks = pageLinks.slice(1, pageLinks.length - 1);
      pageLinks.unshift(url);

      arrayNavLinks = pageLinks;
      dfd1.resolve(arrayNavLinks);
    },
    function (jqXHR, textStatus, errorThrown) {
      dfd1.reject('Disculpe, algo fue mal. ' + textStatus + errorThrown);
    }
  );

  return dfd1.promise();

}
//process-2
/**
 * Process 2: fetches one listing page and appends the product URLs found
 * in it to the shared `arrayUrlLinks`.
 *
 * @param {string} url URL of a listing page.
 * @returns {Promise} jQuery promise resolved with `arrayUrlLinks` (all
 *   product URLs collected so far), or rejected with an error message string.
 */
var getMLUrl = function (url) {
  console.log("init: getMLUrl");
  var dfd2 = $.Deferred();

  // requestAjax() adds the proxy prefix itself; adding it here as well
  // would send the request through the proxy twice.
  requestAjax(url).then(
    function (data) {
      // Re-parse only the inner markup of the results container and look
      // for the product image viewers inside it.
      var result = $(data).find('#searchResults').html();
      var allLinks = $(result).find('div.images-viewer');

      // Each viewer carries the product URL in its `item-url` attribute.
      allLinks.each(function (i, viewer) {
        arrayUrlLinks.push($(viewer).attr('item-url'));
      });
      dfd2.resolve(arrayUrlLinks);
    },
    function (jqXHR, textStatus, errorThrown) {
      dfd2.reject('Disculpe, algo fue mal. ' + textStatus + errorThrown);
    }
  );

  return dfd2.promise();
}
//process-3
/**
 * Process 3: fetches one product page, extracts its details and appends
 * them as a new row to the #tb-prod table.
 *
 * Every scraped value is inserted as text, never as markup, so content
 * coming from the remote page cannot inject HTML into this document.
 *
 * @param {string} url URL of a product page.
 * @returns {Promise} jQuery promise resolved with the row index used for
 *   the new row, or rejected with an error message string.
 */
var getMLInfo = function (url) {
  console.log("init: getMLInfo");
  var dfd3 = $.Deferred();

  // Builds a <td> holding `value` as plain text. String() keeps the
  // rendering of missing values ("undefined") the same as concatenation did.
  var makeCell = function (value) {
    return $('<td>').text(String(value));
  };

  // requestAjax() adds the proxy prefix itself; adding it here as well
  // would send the request through the proxy twice.
  requestAjax(url).then(
    function (data) {
      var product = $(data).find('.short-description');

      // Sizes: text of every variation item with all whitespace removed
      var allSizes = [];
      product.find('.vip-variation-list li').each(function () {
        allSizes.push($(this).text().replace(/\s+/g, ''));
      });
      // The first character of the joined list is dropped.
      // NOTE(review): presumably this removes a leading comma left by an
      // empty first item; confirm against the live markup.
      var sizes = allSizes.join(",").substring(1);

      // Gallery: src of every image in the gallery
      var allImages = [];
      product.find('.gallery-content img').each(function () {
        allImages.push($(this).attr('src'));
      });

      // Cells in the same order as the table header
      var row = $('<tr>').append(
        makeCell(tbIndex),
        makeCell(product.find('input[name="item_id"]').val()),
        makeCell(product.find('.vip-title-main').text()),
        makeCell(product.find('.vip-price').text()),
        makeCell(product.find('.item-conditions dd:first').text()),
        makeCell(product.find('.variations-title:eq(0)').text()),
        makeCell(sizes),
        makeCell(allImages.join(", "))
      );
      $('#tb-prod').append(row);

      // Resolve with the index of the row just written, then advance it
      dfd3.resolve(tbIndex);
      tbIndex++;
    },
    function (jqXHR, textStatus, errorThrown) {
      dfd3.reject('Disculpe, algo fue mal. ' + textStatus + errorThrown);
    }
  );

  return dfd3.promise();
}
//button action
/**
 * Click handler for #btn-exec: runs the three scraping stages in order.
 *
 *   1. getMLNav  - collect the listing pages for the entered URL
 *   2. getMLUrl  - for each page, one at a time, collect product URLs
 *   3. getMLInfo - for each product URL, one at a time, add a table row
 *
 * A failed request is logged and the sequence moves on to the next item.
 */
$(document).on('click', '#btn-exec', function () {
  var url = ($('#input-url').val() || '').trim();

  // Nothing to scrape without a URL
  if (!url) {
    console.log('btn-exec: empty URL');
    return;
  }

  /**
   * Calls task(item) for every item strictly one after another.
   * Each item is bound through the callback parameter, so every queued
   * step sees its own item (a shared loop variable would make all late
   * steps see the last one).
   * Returns a promise that resolves once every item has been processed.
   */
  var runInSequence = function (items, task, label, onSuccess) {
    var chain = $.Deferred().resolve().promise();
    $.each(items, function (index, item) {
      chain = chain.then(function () {
        return task(item).then(onSuccess, function (reason) {
          // Log and swallow the failure so the remaining items still run
          console.log(label + ': ' + reason);
        });
      });
    });
    return chain;
  };

  // Start every run with an empty product list so links collected by an
  // earlier click are not scraped again.
  arrayUrlLinks.length = 0;

  //process-1: get links from nav element
  getMLNav(url).then(
    function (nav) {
      //process-2: get product links from each listing page, sequentially
      runInSequence(nav, getMLUrl, 'getMLUrl', function () {})
        .then(function () {
          //process-3: when done > get products info, sequentially
          // Work on a snapshot so the list cannot change under the loop
          var productUrls = arrayUrlLinks.slice();
          $.each(productUrls, function (index, productUrl) {
            console.log(productUrl);
          });
          return runInSequence(productUrls, getMLInfo, 'getMLInfo', function (rowIndex) {
            console.log(rowIndex);
          });
        });
    },
    function (v) { console.log('getMLNav: ' + v); }
  );
});

This awesome code (PROJ::UBENDO::ML_SCRAPPING) was written by n0xd3v; you can see more from this user in their personal repository.

You can find the original code on Codepen.io

2018 © n0xd3v