Pages

Tuesday 23 July 2013

Nutch Apostrophe/Single Quotes Issue (Solved) - Java Program

Nutch was designed to run on Unix, and the file path rules on Unix differ from those on Windows.  Because we are running Nutch on Windows through Cygwin, it fails when it tries to fetch a file whose name contains an apostrophe, which can leave many files on a system un-crawled.  To work around the issue, you need to run a crawler over the fileshare first, and every file whose path contains an apostrophe must be listed in the nutch.txt file. For some reason, when Nutch is given the exact filename it has no problem with apostrophes, so as long as all filenames containing apostrophes are declared in the nutch.txt file, Nutch will fetch the files without a problem.

I have written a short concurrent Java program that does all the work for you. All you are required to do is pass two arguments: the location of your seed file (Args[0]; in my case nutch.txt) and the fileshare to crawl (Args[1]).


This is the code for fileCrawler.java:


package FileName;
import java.io.File;
import java.io.PrintWriter;
import java.util.ArrayList;
 
public class fileCrawler {
 
  private WorkQueue workQ;
  static int i = 0;
 public static PrintWriter out;
 
 private class Worker implements Runnable {
 
  private WorkQueue queue;
 
  public Worker(WorkQueue q) {
   queue = q;
  }
 
//  since main thread has placed all directories into the workQ, we
//  know that all of them are legal directories; therefore, do not need
//  to try ... catch in the while loop below
 
  public void run() {
   String name;
   while ((name = queue.remove()) != null) {
    File file = new File(name);
    String entries[] = file.list();
    if (entries == null)
     continue;
    for (String entry : entries) {
     if (entry.compareTo(".") == 0)
      continue;
     if (entry.compareTo("..") == 0)
      continue;
     String fn = name + "/" + entry;
     if (fn.contains("'")){
      if (fn.startsWith("//")){
       out.println("file://" + fn.replaceAll(" ", "%20")); //Get rid of all spaces
       System.out.println("file://" + fn.replaceAll(" ", "%20"));
      }
      else{
       out.println("file:/" + fn.replaceAll(" ", "%20")); //Get rid of all spaces
       System.out.println("file:/" + fn.replaceAll(" ", "%20"));
      }
     }
    }
   }
  }
 }
 
 public fileCrawler() {
  workQ = new WorkQueue();
 }
 
 public Worker createWorker() {
  return new Worker(workQ);
 }
 
 
// need try ... catch below in case the directory is not legal
 
 public void processDirectory(String dir) {
  try {
 
   File file = new File(dir);
   if (file.isDirectory()) {
    String entries[] = file.list();
    if (entries != null)
     workQ.add(dir);
 
    for (String entry : entries) {
     String subdir;
     if (entry.compareTo(".") == 0)
      continue;
     if (entry.compareTo("..") == 0)
      continue;
     if (dir.endsWith("/"))
      subdir = dir+entry;
     else
      subdir = dir+"/"+entry;
     processDirectory(subdir);
    }
   }
  } catch (Exception e) {};
 }
 
 public static void main(String Args[]) {
 
  fileCrawler fc = new fileCrawler();
 
  //now start all of the worker threads
  System.out.println("Starting new File Crawler on " + Args[1]);
  int N = 5;
  ArrayList<Thread> thread = new ArrayList<Thread>(N);
  for (int i = 0; i < N; i++) {
   Thread t = new Thread(fc.createWorker());
   thread.add(t);
   t.start();
  }
 
  //File to be written to
  //@throws FileNotFoundException 
  try {
   out = new PrintWriter(Args[0]);
  }
  catch(Exception e){
   System.err.println("File Not Found: " + Args[0]);
  }
 
  //Directory to be crawled
  String a = Args[1];
  fc.processDirectory(a);
 
  //indicate that there are no more directories to add
 
  fc.workQ.finish();
 
  //Finally add the directory so that it can be crawled
  if (a.startsWith("//")){
   System.out.println("Adding: file://"+a);
   out.println("file://"+a);
  }
  else{
   System.out.println("Adding: file:/"+a);
   out.println("file:/"+a);
  }
 
  System.out.println("Closing File");
  out.close();
 
  //Kill the final threads
  for (int i = 0; i < N; i++){
   try {
    thread.get(i).join();
   } catch (Exception e) {};
  }
  System.out.println("Completed");
 }
}
This is the code for WorkQueue.java:

package FileName;
import java.util.*;
 
/**
 * A FIFO queue of directory names shared between one producer and several
 * consumers.
 *
 * <p>All public methods are {@code synchronized} on this object, so the
 * underlying list does not need to be thread-safe itself. Consumers block in
 * {@link #remove()} while the queue is empty and {@link #finish()} has not been
 * called.
 */
public class WorkQueue {

  // Guarded by this object's monitor.
  private LinkedList<String> workQ;
  private boolean done;  // no more directories to be added

  public WorkQueue() {
    workQ = new LinkedList<String>();
    done = false;
  }

  /**
   * Appends an item and wakes any waiting consumers.
   *
   * @param s the item to queue; must not be {@code null}, because
   *     {@link #remove()} uses {@code null} to signal that the queue is finished
   * @throws NullPointerException if {@code s} is {@code null}
   */
  public synchronized void add(String s) {
    if (s == null) {
      throw new NullPointerException("s");
    }
    workQ.add(s);
    notifyAll();
  }

  /**
   * Removes and returns the oldest item, blocking while the queue is empty and
   * not yet finished.
   *
   * <p>An interrupt received while waiting does not abort the wait; the
   * thread's interrupt status is restored before this method returns so the
   * caller can still observe it.
   *
   * @return the next item, or {@code null} once the queue is empty and
   *     {@link #finish()} has been called
   */
  public synchronized String remove() {
    boolean interrupted = false;
    try {
      while (!done && workQ.isEmpty()) {
        try {
          wait();
        } catch (InterruptedException e) {
          // wait() cleared the flag; remember it and keep waiting.
          interrupted = true;
        }
      }
      if (workQ.isEmpty()) {
        return null;
      }
      String s = workQ.remove();
      notifyAll();
      return s;
    } finally {
      if (interrupted) {
        Thread.currentThread().interrupt();
      }
    }
  }

  /** Marks the queue as finished and releases every waiting consumer. */
  public synchronized void finish() {
    done = true;
    notifyAll();
  }
}

No comments:

Post a Comment