Thursday, August 14, 2014

Extract span/div/HTML tag data using Jsoup

How to extract the data inside span/div/HTML tags using Jsoup



import java.io.File;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
// @author velmurugan pousel

public class JsoupTest {

    public static void main(String[] args) {
       
        File input = new File("c:/temp/index.html");
        try {
            Document doc = Jsoup.parse(input, "UTF-8");
   
            //give only the text inside in the tag
            System.out.println(" inside text --> " + doc.select("span[class=eventProfileText]").text());
            //give all the contents from the tag
            System.out.println(" whole content -->" + doc.select("span[class=eventProfileText]").html());
           
        } catch (Throwable e){
            e.printStackTrace();
        }
               
    }
}

input: index.html

<html xmlns="http://www.w3.org/1999/xhtml">
 <head>
 </head>
 <body class="">
  <div class="shop-section line bmargin10 tmargin10">
   <div class="price-section fksk-price-section unit">
    <div class="price-table">
     <div class="line" itemprop="offers" itemscope="" itemtype="http://schema.org/Offer">
      <div class="price-save">
       <span class="label-td"><span class="label fksk-label">Price :</span></span>
      </div>
      <span class="price final-price our fksk-our" id="fk-mprod-our-id">Rs.<span class="small-font"> </span>11990</span>
      <span class="eventProfileText">
<a href="https://www.aamc.org/meetings/123148/2011-ewim.html">Meeting Details</a>
<br/><br/>
</span>


     </div>
     <meta itemprop="price" content="Rs. 11990" />
     <meta itemprop="priceCurrency" content="INR" />
     <div class="our-price-desc fksk-our-price-desc">
      <small>(Prices are inclusive of all taxes)</small>
     </div>
    </div>
   </div>
  </div>
 </body>
</html>

output :

 inside text --> Meeting Details
 whole content --><a href="https://www.aamc.org/meetings/123148/2011-ewim.html">Meeting Details</a> <br /><br />



No comments:

Post a Comment