/*
* TitleExtractor.java
*
*/
import java.net.URL;
import java.net.MalformedURLException;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Small command-line utility that parses an HTML page with the HTML Parser
 * library ({@code org.htmlparser}) and prints the text of every title tag
 * found in it.
 *
 * @author rbolze
 */
public class TitleExtractor {

    /** Page visited by {@link #main(String[])} when no URL is given on the command line. */
    private static final String DEFAULT_URL =
            "http://graal.ens-lyon.fr/~rbolze/linkExtrator/index.html";

    /** Creates a new instance of TitleExtractor. */
    public TitleExtractor() {
    }

    /**
     * Parses the resource at {@code url} and prints, on standard output, the
     * number of title tags found followed by the page URL and title text of
     * each one.
     *
     * <p>A {@link ParserException} raised while parsing is not propagated: it
     * is reported on standard error together with the URL that failed.
     *
     * @param url location of the page to inspect; must not be {@code null}
     * @throws NullPointerException if {@code url} is {@code null}
     */
    public void visit(URL url) {
        if (url == null) {
            // Same exception type the original raised at url.toString(), now with a message.
            throw new NullPointerException("url must not be null");
        }
        try {
            // Keep only the nodes that are instances of TitleTag.
            Parser parser = new Parser(url.toString());
            NodeList list = parser.parse(new NodeClassFilter(TitleTag.class));
            System.out.println("Number of Title =" + list.size());
            for (int i = 0; i < list.size(); i++) {
                TitleTag tag = (TitleTag) list.elementAt(i);
                System.out.println("Title of the page " + tag.getPage().getUrl());
                // Separate the verb from the title so the output stays readable.
                System.out.println("is " + tag.getTitle());
            }
        } catch (ParserException e) {
            System.err.println("cannot parse " + url + " : " + e.getLocalizedMessage());
        }
    }

    /**
     * Entry point: visits the URL given as the first command-line argument,
     * or a built-in default page when no argument is supplied.
     *
     * @param args the command line arguments; {@code args[0]}, when present,
     *             is the URL of the page to inspect
     */
    public static void main(String[] args) {
        String startUrlString = (args != null && args.length > 0) ? args[0] : DEFAULT_URL;
        TitleExtractor titleExtractor = new TitleExtractor();
        try {
            titleExtractor.visit(new URL(startUrlString));
        } catch (MalformedURLException e) {
            System.err.println("invalid url : " + startUrlString);
        }
    }
}
|