[freenet-cvs] r13562 - trunk/freenet/src/freenet/clients/http

swatig0 at freenetproject.org swatig0 at freenetproject.org
Wed Jun 13 16:37:02 UTC 2007


Author: swatig0
Date: 2007-06-13 16:37:02 +0000 (Wed, 13 Jun 2007)
New Revision: 13562

Added:
   trunk/freenet/src/freenet/clients/http/XMLSpider.java
Log:
msg

Added: trunk/freenet/src/freenet/clients/http/XMLSpider.java
===================================================================
--- trunk/freenet/src/freenet/clients/http/XMLSpider.java	                        (rev 0)
+++ trunk/freenet/src/freenet/clients/http/XMLSpider.java	2007-06-13 16:37:02 UTC (rev 13562)
@@ -0,0 +1,1084 @@
+/* This code is part of Freenet. It is distributed under the GNU General
+ * Public License, version 2 (or at your option any later version). See
+ * http://www.gnu.org/ for further details of the GPL. */
+package freenet.clients.http;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.w3c.dom.Attr;
+import org.w3c.dom.DOMImplementation;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
+
+import freenet.client.ClientMetadata;
+import freenet.client.FetchContext;
+import freenet.client.FetchException;
+import freenet.client.FetchResult;
+import freenet.client.InsertException;
+import freenet.client.async.BaseClientPutter;
+import freenet.client.async.ClientCallback;
+import freenet.client.async.ClientGetter;
+import freenet.clients.http.filter.ContentFilter;
+import freenet.clients.http.filter.FoundURICallback;
+import freenet.clients.http.filter.UnsafeContentTypeException;
+import freenet.keys.FreenetURI;
+import freenet.node.NodeClientCore;
+import freenet.node.RequestStarter;
+import freenet.oldplugins.plugin.HttpPlugin;
+import freenet.oldplugins.plugin.PluginManager;
+import freenet.support.HTMLNode;
+import freenet.support.Logger;
+import freenet.support.MultiValueTable;
+import freenet.support.api.Bucket;
+import freenet.support.api.HTTPRequest;
+/**
+ * Spider. Produces an index.
+ */
+public class XMLSpider implements HttpPlugin, ClientCallback, FoundURICallback {
+
+	long tProducedIndex; // System.currentTimeMillis() value recorded at the last index rewrite (see addWord / setPluginManager)
+
+	// URIs visited, or fetching, or queued. Added once then forgotten about.
+	private final HashSet visitedURIs = new HashSet();
+	private final HashSet urisWithWords = new HashSet(); // FreenetURIs for which addWord recorded at least one word
+	private final HashSet failedURIs = new HashSet(); // FreenetURIs passed to onFailure
+	private final HashSet queuedURISet = new HashSet(); // same members as queuedURIList; used for duplicate detection in queueURI
+	private final LinkedList queuedURIList = new LinkedList(); // FreenetURIs waiting to be fetched, in FIFO order
+	private final HashMap runningFetchesByURI = new HashMap(); /* FreenetURI -> ClientGetter */
+	private final HashMap urisByWord = new HashMap(); /* String (word) -> FreenetURI[] */
+	private final HashMap titlesOfURIs = new HashMap(); /* String (URI) -> String (title) */
+	private FileWriter output; // debug log writer used by getIndex
+	private FileWriter output2; // debug log writer used by generateIndex
+	
+	// addWord multiplies this by 10 and compares the result against a millisecond clock.
+	private static final int minTimeBetweenEachIndexRewriting = 1;
+	//private static final String indexFilename = "index.xml";
+	// Hard-coded output directory for index.xml, the sub-indexes and the debug logs.
+	private static final String DEFAULT_INDEX_DIR = "/home/swati/myindex/";
+	// A sub-index whose "entries" counter has reached this value is reported as full by addWord(prefix, word).
+	private static final int MAX_ENTRIES = 5;
+	private static final String pluginName = "XML spider";
+	
+	// Values written into the <header> element of index.xml by produceIndex.
+	private static final String indexTitle= "This is an index";
+	private static final String indexOwner = "Another anonymous";
+	private static final String indexOwnerEmail = null;
+	private final HashMap sizeOfURIs = new HashMap(); /* String (URI) -> Long */
+	private final HashMap mimeOfURIs = new HashMap(); /* String (URI) -> String */
+	private final HashMap lastPositionByURI = new HashMap(); /* String (URI) -> Integer */ /* Use to determine word position on each uri */
+	private final HashMap positionsByWordByURI = new HashMap(); /* String (URI) -> HashMap (String (word) -> Integer[] (Positions)) */
+
+	// Can have many; this limit only exists to save memory.
+	private static final int maxParallelRequests = 20;
+	private int maxShownURIs = 50; // item limit passed to createList for the overview page
+	private HashMap urisToNumbers; /* FreenetURI -> Integer (file id); rebuilt by produceIndex */
+	private NodeClientCore core;
+	private FetchContext ctx;
+	private final short PRIORITY_CLASS = RequestStarter.PREFETCH_PRIORITY_CLASS;
+	private boolean stopped = true; // cleared by startPlugin, set again by stopPlugin
+
+	/**
+	 * Appends a URI to the fetch queue unless it was seen before.
+	 * A URI is accepted only when it is neither in visitedURIs nor already
+	 * in queuedURISet; an accepted URI is marked as visited immediately.
+	 *
+	 * @param uri the URI to enqueue
+	 */
+	private synchronized void queueURI(FreenetURI uri) {
+		// No HTML-only filtering is applied here.
+		if (visitedURIs.contains(uri))
+			return;
+		boolean newlyQueued = queuedURISet.add(uri);
+		if (!newlyQueued)
+			return;
+		queuedURIList.addLast(uri);
+		visitedURIs.add(uri);
+	}
+
+	/** True once the one-off start-up delay has been claimed by a caller. Guarded by this. */
+	private boolean startupDelayServed = false;
+
+	/**
+	 * Queues the bookmark URIs and starts as many queued fetches as the
+	 * maxParallelRequests limit allows.
+	 *
+	 * The 30 second "let the node start up" pause is only taken by the first
+	 * caller; later calls (fetch callbacks, HTTP requests) proceed at once.
+	 * Getters are registered in runningFetchesByURI under the lock but started
+	 * outside it, so that onFailure is never re-entered while the monitor is held.
+	 */
+	private void startSomeRequests() {
+		boolean waitForNode;
+		synchronized (this) {
+			waitForNode = !startupDelayServed;
+			startupDelayServed = true;
+		}
+		if (waitForNode) {
+			try {
+				Thread.sleep(30 * 1000); // Let the node start up
+			} catch (InterruptedException e) {
+				// Keep the interrupt visible to whoever owns this thread.
+				Thread.currentThread().interrupt();
+			}
+		}
+
+		FreenetURI[] initialURIs = core.bookmarkManager.getBookmarkURIs();
+		for (int i = 0; i < initialURIs.length; i++) {
+			queueURI(initialURIs[i]);
+		}
+
+		ArrayList toStart;
+		synchronized (this) {
+			if (stopped) {
+				return;
+			}
+			int running = runningFetchesByURI.size();
+			int queued = queuedURIList.size();
+
+			if ((running >= maxParallelRequests) || (queued == 0))
+				return;
+
+			toStart = new ArrayList(Math.min(maxParallelRequests - running, queued));
+
+			for (int i = running; i < maxParallelRequests; i++) {
+				if (queuedURIList.isEmpty())
+					break;
+				FreenetURI uri = (FreenetURI) queuedURIList.removeFirst();
+				queuedURISet.remove(uri);
+				ClientGetter getter = makeGetter(uri);
+				// Register while still holding the lock so the running count stays accurate.
+				runningFetchesByURI.put(getter.getURI(), getter);
+				toStart.add(getter);
+			}
+		}
+
+		for (int i = 0; i < toStart.size(); i++) {
+			ClientGetter g = (ClientGetter) toStart.get(i);
+			try {
+				g.start();
+			} catch (FetchException e) {
+				onFailure(e, g);
+			}
+		}
+	}
+	
+
+	/**
+	 * Builds (but does not start) a getter for the given URI, using this
+	 * spider as callback and the shared fetch context and priority class.
+	 *
+	 * @param uri the URI to fetch
+	 * @return a new, unstarted ClientGetter
+	 */
+	private ClientGetter makeGetter(FreenetURI uri) {
+		return new ClientGetter(this, core.requestStarters.chkFetchScheduler, core.requestStarters.sskFetchScheduler, uri, ctx, PRIORITY_CLASS, this, null, null);
+	}
+
+	/**
+	 * Fetch-success callback: records the size and MIME type of the fetched
+	 * document and runs it through the content filter, which reports links
+	 * and text back to this object.
+	 *
+	 * The size/MIME maps are updated under the object lock because
+	 * produceIndex reads them while holding it. The bucket is freed on every
+	 * exit path, including failures before filtering starts.
+	 *
+	 * @param result the fetched data and its metadata
+	 * @param state  the getter that completed
+	 */
+	public void onSuccess(FetchResult result, ClientGetter state) {
+		FreenetURI uri = state.getURI();
+		
+		synchronized (this) {
+			runningFetchesByURI.remove(uri);
+		}
+		startSomeRequests();
+		ClientMetadata cm = result.getMetadata();
+		Bucket data = result.asBucket();
+		
+		try {
+			String mimeType = cm.getMIMEType();
+			String uriKey = uri.toString();
+			
+			synchronized (this) {
+				sizeOfURIs.put(uriKey, new Long(data.size()));
+				mimeOfURIs.put(uriKey, mimeType);
+			}
+			
+			ContentFilter.filter(data, ctx.bucketFactory, mimeType, uri.toURI("http://127.0.0.1:8888/"), this);
+		} catch (UnsafeContentTypeException e) {
+			return; // Ignore
+		} catch (IOException e) {
+			Logger.error(this, "Bucket error?: " + e, e);
+		} catch (URISyntaxException e) {
+			Logger.error(this, "Internal error: " + e, e);
+		} finally {
+			data.free();
+		}
+	}
+
+	public void onFailure(FetchException e, ClientGetter state) {
+		FreenetURI uri = state.getURI();
+		
+		synchronized (this) {
+			failedURIs.add(uri);
+			runningFetchesByURI.remove(uri);
+		}
+		if (e.newURI != null)
+			queueURI(e.newURI);
+		else
+			queueURI(uri);
+		startSomeRequests();
+		
+		
+	}
+
+	/**
+	 * Insert-success callback. Intentionally empty: nothing in the code shown
+	 * here starts an insert, so there is nothing to react to.
+	 */
+	public void onSuccess(BaseClientPutter state) {
+		// Ignore
+	}
+
+	/**
+	 * Insert-failure callback. Intentionally empty: nothing in the code shown
+	 * here starts an insert, so there is nothing to react to.
+	 */
+	public void onFailure(InsertException e, BaseClientPutter state) {
+		// Ignore
+	}
+
+	/**
+	 * Insert-URI callback. Intentionally empty: nothing in the code shown
+	 * here starts an insert, so there is nothing to react to.
+	 */
+	public void onGeneratedURI(FreenetURI uri, BaseClientPutter state) {
+		// Ignore
+	}
+
+	/**
+	 * Called with a URI discovered in fetched content (this object is passed
+	 * to ContentFilter.filter as the callback in onSuccess). The URI is
+	 * offered to the queue and more fetches are started if capacity allows.
+	 *
+	 * @param uri the discovered URI
+	 */
+	public void foundURI(FreenetURI uri) {
+		queueURI(uri);
+		startSomeRequests();
+	}
+
+	public void onText(String s, String type, URI baseURI) {
+		
+		FreenetURI uri;
+		try {
+			uri = new FreenetURI(baseURI.getPath().substring(1));
+		} catch (MalformedURLException e) {
+			Logger.error(this, "Caught " + e, e);
+			return;
+		}
+		 
+		
+      
+		if((type != null) && (type.length() != 0) && type.toLowerCase().equals("title")
+		   && (s != null) && (s.length() != 0) && (s.indexOf('\n') < 0)) {
+			/* We should have a correct title */
+			titlesOfURIs.put(uri.toString(), s);
+			type = "title";
+			
+		}
+		else type = null;
+
+//			
+//		for (int i = 0; i < words.length; i++) {
+//			String word = words[i];
+//			if ((word == null) || (word.length() == 0))
+//				continue;
+//			word = word.toLowerCase();
+//			addWord(word, uri);
+//		}
+		String[] words = s.split("[^A-Za-z0-9]");
+
+		Integer lastPosition = null;
+
+		lastPosition = (Integer)lastPositionByURI.get(uri.toString());
+
+		if(lastPosition == null)
+			lastPosition = new Integer(1); /* We start to count from 1 */
+
+		for (int i = 0; i < words.length; i++) {
+			String word = words[i];
+			if ((word == null) || (word.length() == 0))
+				continue;
+			word = word.toLowerCase();
+			try{
+			if(type == null)
+				addWord(word, lastPosition.intValue() + i, uri);
+			else
+				addWord(word, -1 * (i+1), uri);
+			}
+			catch (Exception e){}
+		}
+		
+		if(type == null) {
+			lastPosition = new Integer(lastPosition.intValue() + words.length);
+			lastPositionByURI.put(uri.toString(), lastPosition);
+		}
+		
+	}
+
+	/**
+	 * Records one occurrence of a word in a document.
+	 *
+	 * Words shorter than three characters are ignored. The position is
+	 * appended to the per-document position list of the word and the
+	 * document is added to the word's URI list. If the document is already
+	 * in that list the method returns at once; otherwise the on-disk index is
+	 * rewritten when the rewrite interval has elapsed.
+	 *
+	 * @param word     the (lower-cased) word
+	 * @param position position of the word inside the document
+	 * @param uri      the document
+	 * @throws Exception whatever produceIndex / generateIndex throw, except IOException which is logged
+	 */
+	private synchronized void addWord(String word, int position,FreenetURI uri) throws Exception{
+		if (word.length() < 3)
+			return;
+
+		FreenetURI[] knownURIs = (FreenetURI[]) urisByWord.get(word);
+		urisWithWords.add(uri);
+
+		/* Word position indexation: URI -> (word -> positions) */
+		String uriKey = uri.toString();
+		HashMap positionsOfWords = (HashMap) positionsByWordByURI.get(uriKey);
+		if (positionsOfWords == null) {
+			positionsOfWords = new HashMap();
+			positionsByWordByURI.put(uriKey, positionsOfWords);
+		}
+		Integer[] oldPositions = (Integer[]) positionsOfWords.get(word);
+		Integer[] updatedPositions;
+		if (oldPositions == null) {
+			updatedPositions = new Integer[] { new Integer(position) };
+		} else {
+			updatedPositions = new Integer[oldPositions.length + 1];
+			System.arraycopy(oldPositions, 0, updatedPositions, 0, oldPositions.length);
+			updatedPositions[oldPositions.length] = new Integer(position);
+		}
+		positionsOfWords.put(word, updatedPositions);
+
+		/* Word -> URIs */
+		if (knownURIs != null) {
+			for (int k = 0; k < knownURIs.length; k++) {
+				if (knownURIs[k].equals(uri))
+					return;
+			}
+			FreenetURI[] grown = new FreenetURI[knownURIs.length + 1];
+			System.arraycopy(knownURIs, 0, grown, 0, knownURIs.length);
+			grown[knownURIs.length] = uri;
+			urisByWord.put(word, grown);
+		} else {
+			urisByWord.put(word, new FreenetURI[] { uri });
+		}
+
+		/* Rewrite the index if the last rewrite is old enough */
+		long rewriteNotBefore = tProducedIndex + minTimeBetweenEachIndexRewriting * 10;
+		if (rewriteNotBefore >= System.currentTimeMillis())
+			return;
+		try {
+			produceIndex();
+			generateIndex();
+		} catch (IOException e) {
+			Logger.error(this, "Caught " + e + " while creating index", e);
+		}
+		tProducedIndex = System.currentTimeMillis();
+	}
+
+	/**
+	 * Writes the main index file (index.xml in DEFAULT_INDEX_DIR) and creates
+	 * the sixteen initial sub-index files index_0.xml .. index_f.xml.
+	 *
+	 * The document has the shape
+	 * main_index / (prefix, header, files / file*, keywords / subIndex*).
+	 * As a side effect urisToNumbers is rebuilt so that each indexed URI maps
+	 * to the id used for it in the files section.
+	 *
+	 * Optional attributes (size, mime) and the title option are only written
+	 * when a value is known: passing null to setAttribute makes the
+	 * serializer fail.
+	 *
+	 * @throws IOException declared for callers; see generateSubIndex
+	 * @throws NoSuchAlgorithmException declared for callers
+	 */
+	private synchronized void produceIndex() throws IOException,NoSuchAlgorithmException {
+		// the number of leading hex digits of a word's MD5 used for matching
+		int prefix = 1 ;
+
+		if (urisByWord.isEmpty() || urisWithWords.isEmpty()) {
+			System.out.println("No URIs with words");
+			return;
+		}
+		File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml");
+		StreamResult resultStream = new StreamResult(outputFile);
+
+		/* Initialize xml builder */
+		DocumentBuilderFactory xmlFactory = DocumentBuilderFactory.newInstance();
+		DocumentBuilder xmlBuilder;
+		try {
+			xmlBuilder = xmlFactory.newDocumentBuilder();
+		} catch(javax.xml.parsers.ParserConfigurationException e) {
+			/* Will (should ?) never happen */
+			Logger.error(this, "Spider: Error while initializing XML generator: "+e.toString(), e);
+			return;
+		}
+		DOMImplementation impl = xmlBuilder.getDOMImplementation();
+
+		/* Starting to generate index */
+		Document xmlDoc = impl.createDocument(null, "main_index", null);
+		Element rootElement = xmlDoc.getDocumentElement();
+
+		/* Adding header to the index */
+		Element headerElement = xmlDoc.createElement("header");
+
+		/* -> title */
+		Element subHeaderElement = xmlDoc.createElement("title");
+		Text subHeaderText = xmlDoc.createTextNode(indexTitle);
+		subHeaderElement.appendChild(subHeaderText);
+		headerElement.appendChild(subHeaderElement);
+
+		/* -> owner */
+		subHeaderElement = xmlDoc.createElement("owner");
+		subHeaderText = xmlDoc.createTextNode(indexOwner);
+		subHeaderElement.appendChild(subHeaderText);
+		headerElement.appendChild(subHeaderElement);
+
+		/* -> owner email */
+		if(indexOwnerEmail != null) {
+			subHeaderElement = xmlDoc.createElement("email");
+			subHeaderText = xmlDoc.createTextNode(indexOwnerEmail);
+			subHeaderElement.appendChild(subHeaderText);
+			headerElement.appendChild(subHeaderElement);
+		}
+
+		FreenetURI[] uris = (FreenetURI[]) urisWithWords.toArray(new FreenetURI[urisWithWords.size()]);
+		urisToNumbers = new HashMap();
+		Element prefixElement = xmlDoc.createElement("prefix");
+		prefixElement.setAttribute("value", prefix+"");
+		Element filesElement = xmlDoc.createElement("files"); /* filesElement != fileElement */
+
+		for (int i = 0; i < uris.length; i++) {
+			urisToNumbers.put(uris[i], new Integer(i));
+			String uriKey = uris[i].toString();
+
+			Element fileElement = xmlDoc.createElement("file");
+			fileElement.setAttribute("id", Integer.toString(i));
+			fileElement.setAttribute("key", uriKey);
+
+			Long size = (Long)sizeOfURIs.get(uriKey);
+			if(size == null) {
+				Logger.error(this, "Spider: size is missing");
+			} else {
+				fileElement.setAttribute("size", size.toString());
+			}
+
+			String mime = (String)mimeOfURIs.get(uriKey);
+			if(mime != null) {
+				fileElement.setAttribute("mime", mime);
+			}
+
+			// Pages without a <title> have no entry; skip the option rather than write a null value.
+			String title = (String)titlesOfURIs.get(uriKey);
+			if(title != null) {
+				Element titleElement = xmlDoc.createElement("option");
+				titleElement.setAttribute("name", "title");
+				titleElement.setAttribute("value", title);
+				fileElement.appendChild(titleElement);
+			}
+
+			filesElement.appendChild(fileElement);
+		}
+
+		/* Adding word index: one empty sub-index per hex digit */
+		Element keywordsElement = xmlDoc.createElement("keywords");
+		for(int i = 0;i<16;i++){
+			String hexDigit = Integer.toHexString(i);
+			generateSubIndex(DEFAULT_INDEX_DIR+"index_"+hexDigit+".xml");
+			Element subIndexElement = xmlDoc.createElement("subIndex");
+			//the subindex element key will contain the bits used for matching in that subindex
+			subIndexElement.setAttribute("key", hexDigit);
+			keywordsElement.appendChild(subIndexElement);
+		}
+
+		// make sure that prefix is the first child of root Element
+		rootElement.appendChild(prefixElement);
+		rootElement.appendChild(headerElement);
+		rootElement.appendChild(filesElement);
+		rootElement.appendChild(keywordsElement);
+
+		/* Serialization */
+		DOMSource domSource = new DOMSource(xmlDoc);
+		TransformerFactory transformFactory = TransformerFactory.newInstance();
+		Transformer serializer;
+
+		try {
+			serializer = transformFactory.newTransformer();
+		} catch(javax.xml.transform.TransformerConfigurationException e) {
+			Logger.error(this, "Spider: Error while serializing XML (transformFactory.newTransformer()): "+e.toString(), e);
+			return;
+		}
+
+		serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+		serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+
+		/* final step */
+		try {
+			serializer.transform(domSource, resultStream);
+		} catch(javax.xml.transform.TransformerException e) {
+			Logger.error(this, "Spider: Error while serializing XML (transform()): "+e.toString(), e);
+			return;
+		}
+
+		if(Logger.shouldLog(Logger.MINOR, this))
+			Logger.minor(this, "Spider: indexes regenerated.");
+
+		// The main xml file is now written; the words themselves are
+		// distributed over the sub-indexes by generateIndex().
+	}
+
+	/**
+	 * Distributes the known words over the on-disk sub-indexes.
+	 *
+	 * For each word (alphabetical order, at most the first 100, the cap the
+	 * original code used) the matching sub-index is looked up and the word
+	 * appended. When the sub-index reports that it cannot take the word, it
+	 * is split into sixteen children, its content is redistributed and the
+	 * word is added again.
+	 *
+	 * Fixes over the previous version: the loop no longer runs past the end
+	 * of the word array, the log file is only closed when it was opened and
+	 * is closed on every path, and per-word failures are logged instead of
+	 * being dropped silently.
+	 *
+	 * @throws Exception declared for callers; per-word failures are logged, not rethrown
+	 */
+	private synchronized void generateIndex() throws Exception{
+		String[] words = (String[]) urisByWord.keySet().toArray(new String[urisByWord.size()]);
+		Arrays.sort(words);
+
+		int limit = Math.min(words.length, 100);
+		for (int i = 0; i < limit; i++) {
+			String word = words[i];
+			try {
+				String prefix_match = getIndex(word);
+				if (addWord(prefix_match, word))
+					continue;
+
+				FileWriter splitLog = new FileWriter(DEFAULT_INDEX_DIR+"log3",true);
+				try {
+					splitLog.write("\naddword failes at "+word+" with prefix "+prefix_match);
+					split(prefix_match);
+					regenerateIndex(prefix_match);
+					splitLog.write("finished splitting on prefix "+prefix_match);
+					prefix_match = getIndex(word);
+					splitLog.write("the new prefix "+prefix_match);
+					addWord(prefix_match, word);
+				} finally {
+					splitLog.close();
+				}
+			} catch (Exception e2) {
+				// Keep going with the remaining words, but leave a trace.
+				Logger.error(this, "Spider: could not add word '" + word + "' to the index: " + e2, e2);
+			}
+		}
+	}
+	/**
+	 * Re-adds every word listed in the sub-index file of the given prefix.
+	 * Each word's "v" attribute is looked up again through getIndex and the
+	 * word is appended to whichever sub-index that lookup now returns.
+	 *
+	 * @param prefix key of the sub-index whose words are redistributed
+	 * @throws Exception on parse or I/O errors
+	 */
+	private void regenerateIndex(String prefix) throws Exception{
+		DocumentBuilder parser = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+		Document oldSubIndex = parser.parse(DEFAULT_INDEX_DIR+"index_"+prefix+".xml");
+		NodeList storedWords = oldSubIndex.getDocumentElement().getElementsByTagName("word");
+
+		int n = 0;
+		while (n < storedWords.getLength()) {
+			String storedWord = ((Element) storedWords.item(n)).getAttribute("v");
+			String target = getIndex(storedWord);
+			addWord(target, storedWord);
+			n++;
+		}
+	}
+	/**
+	 * Finds the key of the sub-index a word belongs to.
+	 *
+	 * Reads the current prefix length from index.xml, takes that many leading
+	 * hex digits of the word's MD5 and asks search() for the listed subIndex
+	 * key matching them. Two debug logs ("logfile2" and "search" in
+	 * DEFAULT_INDEX_DIR) are appended to; both writers are closed even when
+	 * hashing or searching throws.
+	 *
+	 * @param word the word to place
+	 * @return the key of the matching sub-index
+	 * @throws Exception on parse, I/O or hashing errors, or when search() finds no key
+	 */
+	private String getIndex(String word) throws Exception {
+		DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+		DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+		Document doc = docBuilder.parse(DEFAULT_INDEX_DIR+"index.xml");
+		Element root = doc.getDocumentElement();
+		Attr prefix_value = (Attr) (root.getElementsByTagName("prefix").item(0)).getAttributes().getNamedItem("value");
+		int prefix = Integer.parseInt(prefix_value.getValue());
+		NodeList subindexList = root.getElementsByTagName("subIndex");
+
+		String str;
+		FileWriter log = new FileWriter(DEFAULT_INDEX_DIR+"logfile2",true);
+		try {
+			log.write("\nword "+word);
+			String md5 = MD5(word);
+			log.write("  md5 "+md5);
+			str = md5.substring(0,prefix);
+			log.write("String "+str);
+			log.write("\n");
+		} finally {
+			log.close();
+		}
+
+		String prefix_match = search(str,subindexList);
+
+		log = new FileWriter(DEFAULT_INDEX_DIR+"search",true);
+		try {
+			log.write("\nPrefix returned "+prefix_match+" with md5 "+str+ " and word "+word);
+		} finally {
+			log.close();
+		}
+
+		return prefix_match;
+	}
+	/**
+	 * Appends a word, with the documents and positions it occurs at, to the
+	 * sub-index file of the given prefix.
+	 *
+	 * The file's "entries" counter is checked first: at MAX_ENTRIES or above
+	 * nothing is written and false is returned. Otherwise the counter is
+	 * incremented, a word element (one file child per document, text =
+	 * comma-separated positions) is appended under keywords and the file is
+	 * rewritten.
+	 *
+	 * Return values are unchanged from the previous version; the difference
+	 * is that failures are now logged instead of being silently discarded.
+	 *
+	 * @param prefix key of the sub-index to modify
+	 * @param str    the word to add
+	 * @return true if the word was added to the document; false if the sub-index is full or an error occurred
+	 * @throws Exception declared for callers; errors are currently logged and reported as false
+	 */
+	private boolean addWord(String prefix, String str) throws Exception
+	{
+		String subIndexPath = DEFAULT_INDEX_DIR+"index_"+prefix+".xml";
+		try{
+			DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+			DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+			Document doc = docBuilder.parse(subIndexPath);
+			Element root = doc.getDocumentElement();
+
+			Element entry = (Element) root.getElementsByTagName("entries").item(0);
+			int entryCount = Integer.parseInt(entry.getAttribute("value"));
+
+			if(entryCount >= MAX_ENTRIES) return false;
+
+			//increment the number of entries
+			entry.setAttribute("value",(entryCount+1)+"");
+
+			//add the entry
+			Element wordElement = doc.createElement("word");
+			wordElement.setAttribute("v", str);
+
+			FreenetURI[] urisForWord = (FreenetURI[]) urisByWord.get(str);
+
+			/* URI by URI */
+			for (int j = 0; j < urisForWord.length; j++) {
+				FreenetURI uri = urisForWord[j];
+				Integer x = (Integer) urisToNumbers.get(uri);
+
+				if (x == null) {
+					Logger.error(this, "Eh?");
+					continue;
+				}
+
+				Element uriElement = doc.createElement("file");
+				uriElement.setAttribute("id", x.toString());
+
+				/* Position by position */
+				HashMap positionsForGivenWord = (HashMap)positionsByWordByURI.get(uri.toString());
+				Integer[] positions = (Integer[])positionsForGivenWord.get(str);
+
+				StringBuffer positionList = new StringBuffer();
+				for(int k=0; k < positions.length ; k++) {
+					if(k!=0)
+						positionList.append(',');
+					positionList.append(positions[k].toString());
+				}
+
+				uriElement.appendChild(doc.createTextNode(positionList.toString()));
+				wordElement.appendChild(uriElement);
+			}
+			Element keywordsElement = (Element) root.getElementsByTagName("keywords").item(0);
+			keywordsElement.appendChild(wordElement);
+
+			DOMSource domSource = new DOMSource(doc);
+			TransformerFactory transformFactory = TransformerFactory.newInstance();
+			Transformer serializer = transformFactory.newTransformer();
+
+			File outputFile = new File(subIndexPath);
+			StreamResult resultStream = new StreamResult(outputFile);
+
+			serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+			serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+
+			/* final step */
+			try {
+				serializer.transform(domSource, resultStream);
+			} catch(javax.xml.transform.TransformerException e) {
+				Logger.error(this, "Spider: could not write " + subIndexPath + ": " + e, e);
+			}
+
+			return true;
+		}
+		catch(Exception e){
+			// Reported as "not added" like before, but no longer invisible.
+			Logger.error(this, "Spider: could not add word '" + str + "' to " + subIndexPath + ": " + e, e);
+		}
+		return false;
+	}
+	/**
+	 * Splits the sub-index with the given key into sixteen children.
+	 *
+	 * In index.xml the subIndex element whose key equals the prefix is
+	 * removed, sixteen subIndex elements keyed prefix+0 .. prefix+f are
+	 * appended, and the stored prefix length is raised by one when it does
+	 * not already exceed the length of the split key. An empty sub-index
+	 * file is generated for each child. Moving the words of the old
+	 * sub-index is left to the caller (see regenerateIndex).
+	 *
+	 * A failure while writing index.xml is logged (it used to be dropped).
+	 *
+	 * @param prefix key of the sub-index to split
+	 * @throws Exception on parse or transformer set-up errors
+	 */
+	private void split(String prefix) throws Exception
+	{
+		// make the entry in the main index..
+		DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
+		DocumentBuilder docBuilder = docFactory.newDocumentBuilder();
+		Document doc = docBuilder.parse(DEFAULT_INDEX_DIR+"index.xml");
+		Element root = doc.getDocumentElement();
+		Element prefixElt =(Element) root.getElementsByTagName("prefix").item(0);
+		int prefix_current = Integer.parseInt(prefixElt.getAttribute("value"));
+		if (prefix_current <= prefix.length())
+			prefixElt.setAttribute("value", (prefix_current+1)+"");
+
+		Element keywordElement = (Element) root.getElementsByTagName("keywords").item(0);
+
+		// Remove the entry being split; stop at the first match because the list is live.
+		NodeList subIndexElt = root.getElementsByTagName("subIndex");
+		for(int i =0;i<subIndexElt.getLength();i++)
+		{
+			Element subIndex = (Element) subIndexElt.item(i);
+			if((subIndex.getAttribute("key")).equals(prefix)) {
+				keywordElement.removeChild(subIndex);
+				break;
+			}
+		}
+
+		for(int i = 0;i<16;i++)
+		{
+			Element subIndex = doc.createElement("subIndex");
+			generateSubIndex(DEFAULT_INDEX_DIR+"index_"+prefix+Integer.toHexString(i)+".xml");
+			subIndex.setAttribute("key",prefix.concat(Integer.toHexString(i)));
+			keywordElement.appendChild(subIndex);
+		}
+
+		DOMSource domSource = new DOMSource(doc);
+		TransformerFactory transformFactory = TransformerFactory.newInstance();
+		Transformer serializer = transformFactory.newTransformer();
+
+		File outputFile = new File(DEFAULT_INDEX_DIR+"index.xml");
+		StreamResult resultStream = new StreamResult(outputFile);
+
+		serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+		serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+
+		/* final step */
+		try {
+			serializer.transform(domSource, resultStream);
+		} catch(javax.xml.transform.TransformerException e) {
+			Logger.error(this, "Spider: could not rewrite index.xml after splitting '" + prefix + "': " + e, e);
+		}
+	}
+	/**
+	 * Returns the longest prefix of str that equals the "key" attribute of
+	 * one of the given elements.
+	 *
+	 * Candidates are tried from the full string down to the empty string.
+	 * The previous recursive version ended in substring(0, -1) and an opaque
+	 * StringIndexOutOfBoundsException when nothing matched; this version
+	 * fails with a descriptive exception instead.
+	 *
+	 * @param str  the string (hex digits of an MD5) to match
+	 * @param list elements carrying a "key" attribute
+	 * @return the matching key
+	 * @throws IllegalStateException if no prefix of str, including the empty string, is a key
+	 * @throws Exception declared for callers
+	 */
+	public String search(String str,NodeList list) throws Exception
+	{
+		String candidate = str;
+		while (true) {
+			for(int i = 0;i<list.getLength();i++){
+				Element subIndex = (Element) list.item(i);
+				String key = subIndex.getAttribute("key");
+				if(key.equals(candidate)) return key;
+			}
+			if (candidate.length() == 0)
+				throw new IllegalStateException("No sub-index key is a prefix of \"" + str + "\"");
+			candidate = candidate.substring(0, candidate.length() - 1);
+		}
+	}
+
+//		
+//		output.close();
+//		return search(str.substring(0,prefix-1),list);	
+		
+
+	
+	/**
+	 * Serves the spider's status pages.
+	 *
+	 * Supported values of the "action" parameter:
+	 * none/empty redirects to "?action=list";
+	 * "list" renders the running / queued / visited / failed URI lists, or a
+	 * single complete list when "listName" is given;
+	 * "add" re-queues the URI given in "key" and redirects back to the list.
+	 * Any other action produces no response, as before.
+	 *
+	 * The collections are copied while holding the object lock, since fetch
+	 * callbacks modify them under that lock; rendering then works on the
+	 * copies. The heading typo "Running FetcheIIIs" is corrected.
+	 *
+	 * @param request the HTTP request
+	 * @param context the toadlet context used to build and send the reply
+	 */
+	public void handleGet(HTTPRequest request, ToadletContext context) throws IOException, ToadletContextClosedException {
+		String action = request.getParam("action");
+		PageMaker pageMaker = context.getPageMaker();
+		if ((action == null) || (action.length() == 0)) {
+			MultiValueTable responseHeaders = new MultiValueTable();
+			responseHeaders.put("Location", "?action=list");
+			context.sendReplyHeaders(301, "Redirect", responseHeaders, "text/html; charset=utf-8", 0);
+			return;
+		} else if ("list".equals(action)) {
+
+			String listName = request.getParam("listName", null);
+			HTMLNode pageNode = pageMaker.getPageNode("The XML Spider", context);
+			HTMLNode contentNode = pageMaker.getContentNode(pageNode);
+
+			/* create copies for multi-threaded use */
+			Map runningFetches;
+			List queued;
+			Set visited;
+			Set failed;
+			synchronized (this) {
+				runningFetches = new HashMap(runningFetchesByURI);
+				queued = new ArrayList(queuedURIList);
+				visited = new HashSet(visitedURIs);
+				failed = new HashSet(failedURIs);
+			}
+
+			if (listName == null) {
+				contentNode.addChild(createNavbar(pageMaker, runningFetches.size(), queued.size(), visited.size(), failed.size()));
+				contentNode.addChild(createAddBox(pageMaker, context));
+				contentNode.addChild(createList(pageMaker, "Running Fetches", "running", runningFetches.keySet(), maxShownURIs));
+				contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued, maxShownURIs));
+				contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited, maxShownURIs));
+				contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed, maxShownURIs));
+			} else {
+				contentNode.addChild(createBackBox(pageMaker));
+				if ("failed".equals(listName)) {
+					contentNode.addChild(createList(pageMaker, "Failed URIs", "failed", failed, -1));
+				} else if ("visited".equals(listName)) {
+					contentNode.addChild(createList(pageMaker, "Visited URIs", "visited", visited, -1));
+				} else if ("queued".equals(listName)) {
+					contentNode.addChild(createList(pageMaker, "Queued URIs", "queued", queued, -1));
+				} else if ("running".equals(listName)) {
+					contentNode.addChild(createList(pageMaker, "Running Fetches", "running", runningFetches.keySet(), -1));
+				}
+			}
+			MultiValueTable responseHeaders = new MultiValueTable();
+			byte[] responseBytes = pageNode.generate().getBytes("utf-8");
+			context.sendReplyHeaders(200, "OK", responseHeaders, "text/html; charset=utf-8", responseBytes.length);
+			context.writeData(responseBytes);
+		} else if ("add".equals(action)) {
+			String uriParam = request.getParam("key");
+			try {
+				FreenetURI uri = new FreenetURI(uriParam);
+				synchronized (this) {
+					// Forget earlier attempts so queueURI accepts the URI again.
+					failedURIs.remove(uri);
+					visitedURIs.remove(uri);
+				}
+				queueURI(uri);
+				startSomeRequests();
+			} catch (MalformedURLException mue1) {
+				sendSimpleResponse(context, "URL invalid", "The given URI is not valid.");
+				return;
+			}
+			MultiValueTable responseHeaders = new MultiValueTable();
+			responseHeaders.put("Location", "?action=list");
+			context.sendReplyHeaders(301, "Redirect", responseHeaders, "text/html; charset=utf-8", 0);
+			return;
+		}
+	}
+
+	/**
+	 * POST handler. The body is empty: the request is accepted and nothing is
+	 * sent in reply. All interaction with the spider goes through handleGet.
+	 *
+	 * @see freenet.oldplugins.plugin.HttpPlugin#handlePost(freenet.clients.http.HTTPRequestImpl, freenet.clients.http.ToadletContext)
+	 */
+	public void handlePost(HTTPRequest request, ToadletContext context) throws IOException {
+	}
+	
+	/**
+	 * Sends a minimal HTML page consisting of one info box.
+	 *
+	 * @param context the toadlet context to reply on
+	 * @param title   page title, also used as the heading of the info box
+	 * @param message text placed inside the info box
+	 */
+	private void sendSimpleResponse(ToadletContext context, String title, String message) throws ToadletContextClosedException, IOException {
+		PageMaker maker = context.getPageMaker();
+		HTMLNode page = maker.getPageNode(title, context);
+		HTMLNode box = maker.getContentNode(page).addChild(maker.getInfobox("infobox-alter", title));
+		maker.getContentNode(box).addChild("#", message);
+
+		byte[] payload = page.generate().getBytes("utf-8");
+		MultiValueTable noHeaders = new MultiValueTable();
+		context.sendReplyHeaders(200, "OK", noHeaders, "text/html; charset=utf-8", payload.length);
+		context.writeData(payload);
+	}
+	
+	private HTMLNode createBackBox(PageMaker pageMaker) {
+		HTMLNode backbox = pageMaker.getInfobox((String) null);
+		HTMLNode backContent = pageMaker.getContentNode(backbox);
+		backContent.addChild("#", "Return to the ");
+		backContent.addChild("a", "href", "?action=list", "list of all URIs");
+		backContent.addChild("#", ".");
+		return backbox;
+	}
+	
+	/**
+	 * Builds the "Add a URI" info box: a GET form with a hidden
+	 * action=add field, a text field named "key" and a submit button.
+	 *
+	 * @param pageMaker factory for the info box
+	 * @param ctx       unused
+	 * @return the info box node
+	 */
+	private HTMLNode createAddBox(PageMaker pageMaker, ToadletContext ctx) {
+		HTMLNode box = pageMaker.getInfobox("Add a URI");
+		HTMLNode boxContent = pageMaker.getContentNode(box);
+
+		String[] formAttrs = new String[] { "action", "method" };
+		String[] formValues = new String[] { "", "get" };
+		HTMLNode form = boxContent.addChild("form", formAttrs, formValues);
+
+		// hidden field selecting the "add" action
+		form.addChild("input", new String[] { "type", "name", "value" }, new String[] { "hidden", "action", "add" });
+		// text field for the key
+		form.addChild("input", new String[] { "type", "size", "name", "value" }, new String[] { "text", "40", "key", "" });
+		// submit button
+		form.addChild("input", new String[] { "type", "value" }, new String[] { "submit", "Add URI" });
+		return box;
+	}
+
+	/**
+	 * Builds the navigation box linking to the four list anchors on the
+	 * overview page, each label showing the size of its list.
+	 *
+	 * @param pageMaker factory for the info box
+	 * @param running   number of running fetches
+	 * @param queued    number of queued URIs
+	 * @param visited   number of visited URIs
+	 * @param failed    number of failed URIs
+	 * @return the navigation box node
+	 */
+	private HTMLNode createNavbar(PageMaker pageMaker, int running, int queued, int visited, int failed) {
+		HTMLNode box = pageMaker.getInfobox("navbar", "Page Navigation");
+		HTMLNode items = pageMaker.getContentNode(box).addChild("ul");
+
+		String[] anchors = { "#running", "#queued", "#visited", "#failed" };
+		String[] labels = {
+			"Running (" + running + ')',
+			"Queued (" + queued + ')',
+			"Visited (" + visited + ')',
+			"Failed (" + failed + ')'
+		};
+		for (int n = 0; n < anchors.length; n++) {
+			items.addChild("li").addChild("a", "href", anchors[n], labels[n]);
+		}
+		return box;
+	}
+
+	/**
+	 * Renders a collection of FreenetURIs as an anchored info box, one URI
+	 * per line.
+	 *
+	 * At most maxCount URIs are shown; when more remain, a "Show all" link to
+	 * the complete list follows. A negative maxCount means no limit. (The
+	 * previous version showed maxCount + 1 items and displayed the link even
+	 * when the last item shown was the last in the collection.)
+	 *
+	 * @param pageMaker  factory for the info box
+	 * @param listName   heading of the box
+	 * @param anchorName anchor name, also used as the listName parameter of the "Show all" link
+	 * @param collection the FreenetURIs to list
+	 * @param maxCount   maximum number of URIs to show, or a negative value for all
+	 * @return the node containing anchor and box
+	 */
+	private HTMLNode createList(PageMaker pageMaker, String listName, String anchorName, Collection collection, int maxCount) {
+		HTMLNode listNode = new HTMLNode("div");
+		listNode.addChild("a", "name", anchorName);
+		HTMLNode listBox = pageMaker.getInfobox(listName);
+		HTMLNode listContent = pageMaker.getContentNode(listBox);
+		listNode.addChild(listBox);
+		Iterator collectionItems = collection.iterator();
+		int itemCount = 0;
+		while (collectionItems.hasNext()) {
+			if ((maxCount >= 0) && (itemCount >= maxCount)) {
+				// Limit reached and at least one more item exists.
+				listContent.addChild("br");
+				listContent.addChild("a", "href", "?action=list&listName=" + anchorName, "Show all\u2026");
+				break;
+			}
+			FreenetURI uri = (FreenetURI) collectionItems.next();
+			listContent.addChild("#", uri.toString());
+			listContent.addChild("br");
+			itemCount++;
+		}
+		return listNode;
+	}
+
+	/**
+	 * Returns the display name of this plugin, the constant "XML spider".
+	 *
+	 * @see freenet.oldplugins.plugin.Plugin#getPluginName()
+	 */
+	public String getPluginName() {
+		return pluginName;
+	}
+
+	/**
+	 * Wires the spider to the node: keeps the client core, obtains a fetch
+	 * context from a new client, sets 10 retries for both retry settings and
+	 * 2 MiB for both size limits, and resets the index timestamp.
+	 *
+	 * @see freenet.oldplugins.plugin.Plugin#setPluginManager(freenet.oldplugins.plugin.PluginManager)
+	 */
+	public void setPluginManager(PluginManager pluginManager) {
+		final int retries = 10;
+		final int sizeLimit = 2 * 1024 * 1024;
+
+		core = pluginManager.getClientCore();
+		ctx = core.makeClient((short) 0).getFetchContext();
+
+		ctx.maxSplitfileBlockRetries = retries;
+		ctx.maxNonSplitfileRetries = retries;
+		ctx.maxTempLength = sizeLimit;
+		ctx.maxOutputLength = sizeLimit;
+
+		tProducedIndex = System.currentTimeMillis();
+	}
+
+
+	/**
+	 * Clears the stopped flag and launches a daemon thread named
+	 * "Spider Plugin Starter" that calls startSomeRequests(), so this method
+	 * returns without waiting for that call.
+	 *
+	 * @see freenet.oldplugins.plugin.Plugin#startPlugin()
+	 */
+	public void startPlugin() {
+		stopped = false;
+
+		Runnable starter = new Runnable() {
+			public void run() {
+				startSomeRequests();
+			}
+		};
+		Thread starterThread = new Thread(starter, "Spider Plugin Starter");
+		starterThread.setDaemon(true);
+		starterThread.start();
+	}
+
+	/**
+	 * Sets the stopped flag and clears queuedURIList. Both updates are made
+	 * while holding this object's monitor.
+	 *
+	 * @see freenet.oldplugins.plugin.Plugin#stopPlugin()
+	 */
+	public void stopPlugin() {
+		synchronized (this) {
+			stopped = true;
+			queuedURIList.clear();
+		}
+	}
+
+	/**
+	 * Intentionally empty: this class takes no action on this notification.
+	 */
+	public void onMajorProgress() {
+		// Ignore
+	}
+
+	/**
+	 * Intentionally empty: this class takes no action on this notification.
+	 *
+	 * @param state unused
+	 */
+	public void onFetchable(BaseClientPutter state) {
+		// Ignore
+	}
+	/**
+	 * Converts a byte array to its lower-case hexadecimal representation,
+	 * two digits per byte, high nibble first.
+	 *
+	 * @param data the bytes to convert
+	 * @return a string of length 2 * data.length containing only 0-9 and a-f
+	 */
+	private static String convertToHex(byte[] data) {
+		StringBuffer buf = new StringBuffer(data.length * 2);
+		for (int i = 0; i < data.length; i++) {
+			/* Masking with 0x0F discards the sign-extension bits of the promoted byte. */
+			buf.append(Character.forDigit((data[i] >> 4) & 0x0F, 16));
+			buf.append(Character.forDigit(data[i] & 0x0F, 16));
+		}
+		return buf.toString();
+	}
+
+	/**
+	 * Returns the MD5 hash of the given string as 32 lower-case hexadecimal
+	 * characters. The string is encoded as ISO-8859-1 before hashing.
+	 *
+	 * The whole encoded byte array is hashed. The number of encoded bytes can
+	 * be smaller than text.length() (a surrogate pair encodes to a single
+	 * replacement byte), so the character count must not be used as the byte
+	 * count.
+	 *
+	 * @param text the string to hash
+	 * @return the hexadecimal MD5 digest of the encoded string
+	 * @throws NoSuchAlgorithmException if no MD5 implementation is available
+	 * @throws UnsupportedEncodingException if ISO-8859-1 is not supported
+	 */
+	public static String MD5(String text) throws NoSuchAlgorithmException, UnsupportedEncodingException  {
+		MessageDigest md = MessageDigest.getInstance("MD5");
+		byte[] encoded = text.getBytes("iso-8859-1");
+		return convertToHex(md.digest(encoded));
+	}
+	
+	public void generateSubIndex(String filename){
+//generates the new subIndex
+		File outputFile = new File(filename);
+		StreamResult resultStream;
+		resultStream = new StreamResult(outputFile);
+
+		/* Initialize xml builder */
+		Document xmlDoc = null;
+		DocumentBuilderFactory xmlFactory = null;
+		DocumentBuilder xmlBuilder = null;
+		DOMImplementation impl = null;
+		Element rootElement = null;
+
+		xmlFactory = DocumentBuilderFactory.newInstance();
+
+
+		try {
+			xmlBuilder = xmlFactory.newDocumentBuilder();
+		} catch(javax.xml.parsers.ParserConfigurationException e) {
+			/* Will (should ?) never happen */
+			Logger.error(this, "Spider: Error while initializing XML generator: "+e.toString());
+			return;
+		}
+
+
+		impl = xmlBuilder.getDOMImplementation();
+
+		/* Starting to generate index */
+
+		xmlDoc = impl.createDocument(null, "sub_index", null);
+		rootElement = xmlDoc.getDocumentElement();
+
+		/* Adding header to the index */
+		Element headerElement = xmlDoc.createElement("header");
+
+		/* -> title */
+		Element subHeaderElement = xmlDoc.createElement("title");
+		Text subHeaderText = xmlDoc.createTextNode(indexTitle);
+		
+		subHeaderElement.appendChild(subHeaderText);
+		headerElement.appendChild(subHeaderElement);
+
+		/* -> owner */
+		subHeaderElement = xmlDoc.createElement("owner");
+		subHeaderText = xmlDoc.createTextNode(indexOwner);
+		
+		subHeaderElement.appendChild(subHeaderText);
+		headerElement.appendChild(subHeaderElement);
+		
+	
+		/* -> owner email */
+		if(indexOwnerEmail != null) {
+			subHeaderElement = xmlDoc.createElement("email");
+			subHeaderText = xmlDoc.createTextNode(indexOwnerEmail);
+			
+			subHeaderElement.appendChild(subHeaderText);
+			headerElement.appendChild(subHeaderElement);
+		}
+
+		
+		Element filesElement = xmlDoc.createElement("files"); /* filesElement != fileElement */
+
+		Element EntriesElement = xmlDoc.createElement("entries");
+		EntriesElement.setNodeValue("0");
+		EntriesElement.setAttribute("value", "0");
+		//all index files are ready
+		/* Adding word index */
+		Element keywordsElement = xmlDoc.createElement("keywords");
+		
+		rootElement.appendChild(EntriesElement);
+		rootElement.appendChild(headerElement);
+		rootElement.appendChild(filesElement);
+		rootElement.appendChild(keywordsElement);
+
+		/* Serialization */
+		DOMSource domSource = new DOMSource(xmlDoc);
+		TransformerFactory transformFactory = TransformerFactory.newInstance();
+		Transformer serializer;
+
+		try {
+			serializer = transformFactory.newTransformer();
+		} catch(javax.xml.transform.TransformerConfigurationException e) {
+			Logger.error(this, "Spider: Error while serializing XML (transformFactory.newTransformer()): "+e.toString());
+			return;
+		}
+
+
+		serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+		serializer.setOutputProperty(OutputKeys.INDENT,"yes");
+		
+		/* final step */
+		try {
+			serializer.transform(domSource, resultStream);
+		} catch(javax.xml.transform.TransformerException e) {
+			Logger.error(this, "Spider: Error while serializing XML (transform()): "+e.toString());
+			return;
+		}
+
+		if(Logger.shouldLog(Logger.MINOR, this))
+			Logger.minor(this, "Spider: indexes regenerated.");
+	}
+
+	
+	
+}




More information about the cvs mailing list