/*
 * Demonstration for HTTP access and RegExp in Java.
 */

package iis;

import java.io.*;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.regex.*;

/**
 * Small demonstration of fetching a document over a URL and pulling data
 * out of it with regular expressions.
 *
 * The regular expressions are intentionally simplistic; they are meant to
 * illustrate java.util.regex, not to parse arbitrary HTML.
 */
public class WebCrawler {
	// Matches '<div ...>' and '<p ...>' tags that carry exactly one 'id' or
	// 'class' attribute. The value is matched with [^"]* rather than a lazy
	// '.*?' so that it cannot run across a closing quote and swallow further
	// attributes (e.g. '<div id="a" class="b">' must not match).
	private static final Pattern SINGLE_ATTRIBUTE_ELEMENT =
			Pattern.compile("<(div|p) (id|class)=\"([^\"]*)\">");

	// Matches links of the following two formats:
	//     <a href="xxx">...</a> and <a href=xxx>...</a>
	private static final Pattern HYPERLINK =
			Pattern.compile("<a href=(\".*?\"|[^>]*)>(.*?)</a>");

	// Capture only result links. Note that this works as of 2005-03-03 but
	// may change at any point in time because it's Google. :-)
	//private static final Pattern HYPERLINK = Pattern.compile("<a href=([^ ]+) onmousedown=\"return \\w+\\(this,'\\w+',\\d+\\)\">(.*?)</a>");

	/**
	 * Fetches a sample page and prints the data extracted from it.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		WebCrawler wc = new WebCrawler();

		// A very basic example
		String doc = wc.getDocumentFromURL("http://www.csszengarden.com/");
		if (doc != null) {
			wc.simpleExtractData(doc);
		}

		// A more advanced example that falsifies the User-agent field
		/*
		String doc = wc.getDocumentFromURL("http://www.google.com/search?hl=en&q=music&btnG=Google+Search", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)");
		if (doc != null)
			wc.complexExtractData(doc);
		*/
	}

	/**
	 * Extracts some simple but useless data from the given HTML string and
	 * prints one report per match to standard output: the matched tag, the
	 * element name, the attribute name and the attribute value.
	 *
	 * @param s the HTML text to scan
	 */
	private void simpleExtractData(String s) {
		Matcher m = SINGLE_ATTRIBUTE_ELEMENT.matcher(s);
		while (m.find()) {
			System.out.println("Entire pattern = '" + m.group() + "'\n\tElement = '" + m.group(1) + "'\n\tValue of attribute '" + m.group(2) + "' is '" + m.group(3) + "'");
		}
	}

	/**
	 * Extracts hyperlinks from the given HTML string and prints, for each
	 * match, the whole anchor element, the href value and the link text.
	 *
	 * @param s the HTML text to scan
	 */
	private void complexExtractData(String s) {
		Matcher m = HYPERLINK.matcher(s);
		while (m.find()) {
			System.out.println(m.group() + "\n\t" + m.group(1) + "\n\t" + m.group(2));
		}
	}

	/**
	 * Retrieves a document from a given URL without setting a user agent.
	 *
	 * @param url the address of the document
	 * @return the document text with every line terminated by '\n', or
	 *         null if the URL is malformed or the document cannot be read
	 */
	private String getDocumentFromURL(String url) {
		return getDocumentFromURL(url, null);
	}

	/**
	 * Retrieves a document from a given URL using the given user agent string.
	 *
	 * Failures are reported on standard error and signalled to the caller by
	 * a null return value rather than by an exception.
	 *
	 * @param url the address of the document
	 * @param ua  value for the "User-agent" request property; when null the
	 *            property is not set
	 * @return the document text with every line terminated by '\n', or
	 *         null if the URL is malformed or the document cannot be read
	 */
	private String getDocumentFromURL(String url, String ua) {
		URL u;
		try {
			u = new URL(url);
		} catch (MalformedURLException e) {
			System.err.println("ERROR: Malformed URL: " + url);
			return null;
		}

		try {
			URLConnection conn = u.openConnection();
			if (ua != null) {
				conn.setRequestProperty("User-agent", ua);
			}

			StringBuilder doc = new StringBuilder();
			// try-with-resources closes the reader (and the underlying
			// stream) on both the success and the failure path. The charset
			// is fixed to UTF-8 so that decoding does not depend on the
			// platform default.
			try (BufferedReader br = new BufferedReader(
					new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
				String line;
				while ((line = br.readLine()) != null) {
					doc.append(line).append('\n');
				}
			}
			return doc.toString();
		} catch (IOException e) {
			System.err.println("ERROR: Unable to retrieve URL: " + url);
			e.printStackTrace();
			return null;
		}
	}

}

