*** empty log message ***
authorhkhang <hkhang>
Thu, 29 Oct 2009 20:23:57 +0000 (20:23 +0000)
committerhkhang <hkhang>
Thu, 29 Oct 2009 20:23:57 +0000 (20:23 +0000)
GlobalQuery.java [new file with mode: 0644]
LocalQuery.java [new file with mode: 0644]
QueryList.java [new file with mode: 0644]
QueryQueue.java [new file with mode: 0644]
QueryTask.java [new file with mode: 0644]
Spider.java [new file with mode: 0644]
dstm.conf [new file with mode: 0644]

diff --git a/GlobalQuery.java b/GlobalQuery.java
new file mode 100644 (file)
index 0000000..0a9d163
--- /dev/null
@@ -0,0 +1,92 @@
+public class GlobalQuery {
+  GlobalString hostname;
+  GlobalString path;
+       int depth;
+  
+       public GlobalQuery(GlobalString hostname) {
+               this.hostname = global new GlobalString(hostname);
+               this.path = global new GlobalString("");
+               this.depth = 0;
+       }
+
+  public GlobalQuery(GlobalString hostname, GlobalString path, int depth) {
+    this.hostname = global new GlobalString(hostname);
+    this.path = global new GlobalString(path);
+               this.depth = depth;
+  }
+
+       public int getDepth() {
+               return depth;
+       }
+       
+  public GlobalString getHostName() {
+    return hostname;
+  }
+  public GlobalString getPath() {
+    return path;
+  }
+
+  public GlobalString makewebcanonical(GlobalString page) {
+    GlobalStringBuffer b = global new GlobalStringBuffer(getHostName(page));
+    b.append("/");
+               b.append(getPathName(page));
+    return b.toGlobalString();
+       }
+
+  public GlobalString getHostName(GlobalString page) {
+    GlobalString http = global new GlobalString("http://");
+    GlobalString https = global new GlobalString("https://");
+               int beginindex;
+               int endindex;
+
+               if ((page.indexOf(http) == -1) && (page.indexOf(https) == -1)) {
+                       return getHostName();
+               } 
+               else if (page.indexOf(https) != -1) {
+                       beginindex = page.indexOf(https) + https.length();
+               }
+               else {
+                       beginindex = page.indexOf(http) + http.length();
+               }
+         endindex = page.indexOf('/',beginindex+1);
+
+               if ((beginindex == -1)) {       
+                       System.printString("ERROR");
+               }
+               if (endindex == -1)
+                       endindex = page.length();
+
+               return page.subString(beginindex, endindex);
+  }
+
+  
+       public GlobalString getPathName(GlobalString page) {
+    GlobalString http = global new GlobalString("http://");
+    GlobalString https = global new GlobalString("https://");
+               int beginindex;
+               int nextindex;
+
+               if ((page.indexOf(http) == -1) && (page.indexOf(https) == -1)) {
+      GlobalString path = getPath();
+           int lastindex = path.lastindexOf('/');
+           if (lastindex == -1)
+        return page;
+           
+      GlobalStringBuffer sb = global new GlobalStringBuffer(path.subString(0,lastindex+1));
+           sb.append(page);
+      return sb.toGlobalString();
+    } 
+               else if (page.indexOf(https) != -1) {
+                       beginindex = page.indexOf(https) + https.length();
+               }
+               else {
+                       beginindex = page.indexOf(http) + http.length();
+               }
+               nextindex = page.indexOf('/',beginindex+1);
+
+               if ((beginindex == -1) || (nextindex == -1))
+                       return global new GlobalString("index.html");
+               return page.subString(nextindex+1, page.length());
+  }
+}
diff --git a/LocalQuery.java b/LocalQuery.java
new file mode 100644 (file)
index 0000000..1beeadb
--- /dev/null
@@ -0,0 +1,95 @@
+public class LocalQuery {
+  String hostname;
+  String path;
+       StringBuffer response;
+       int depth;
+  
+  public LocalQuery(String hostname, String path, int depth) {
+    this.hostname = new String(hostname);
+    this.path = new String(path);
+               response = new StringBuffer();
+               this.depth = depth;
+  }
+
+       public int getDepth() {
+               return depth;
+       }
+       
+  public String getHostName() {
+    return hostname;
+  }
+  public String getPath() {
+    return path;
+  }
+
+  public void outputFile() {
+               StringBuffer sb = new StringBuffer(hostname);
+               sb.append(path);
+    FileOutputStream fos = new FileOutputStream(sb.toString().replace('/','#'));
+    fos.write(response.toString().getBytes());
+    fos.close();
+  }
+
+  public String makewebcanonical(String page) {
+    StringBuffer b = new StringBuffer(getHostName(page));
+    b.append("/");
+               b.append(getPathName(page));
+    return b.toString();
+  }
+
+       public String getHostName(String page) {
+               String http = new String("http://");
+               String https = new String("https://");
+               int beginindex;
+               int endindex;
+
+               if ((page.indexOf(http) == -1) && (page.indexOf(https) == -1)) {
+                       return getHostName();
+               } 
+               else if (page.indexOf(https) != -1) {
+                       beginindex = page.indexOf(https) + https.length();
+               }
+               else {
+                       beginindex = page.indexOf(http) + http.length();
+               }
+               endindex = page.indexOf('/',beginindex+1);
+
+               if ((beginindex == -1)) {
+                       System.printString("ERROR");
+               }
+               if (endindex == -1)
+                       endindex = page.length();
+
+               return page.subString(beginindex, endindex);
+       }
+
+       public String getPathName(String page) {
+               String http = new String("http://");
+               String https = new String("https://");
+               int beginindex;
+               int nextindex;
+
+               if ((page.indexOf(http) == -1) && (page.indexOf(https) == -1)) {
+                       String path = getPath();
+                       int lastindex = path.lastindexOf('/');
+                       if (lastindex == -1)
+                               return page;
+           
+                       StringBuffer sb = new StringBuffer(path.subString(0,lastindex+1));
+                       sb.append(page);
+                       return sb.toString();
+               }
+               else if (page.indexOf(https) != -1) {
+                       beginindex = page.indexOf(https) + https.length();
+               }
+               else {
+                       beginindex = page.indexOf(http) + http.length();
+               }
+               nextindex = page.indexOf('/',beginindex+1);
+
+               if ((beginindex==-1) || (nextindex==-1))
+                       return new String("index.html");
+               return page.subString(nextindex+1, page.length());
+       }
+}
diff --git a/QueryList.java b/QueryList.java
new file mode 100644 (file)
index 0000000..d09167b
--- /dev/null
@@ -0,0 +1,22 @@
+public class QueryList extends Queue {
+       Queue queries;
+
+  public QueryList() {
+               queries = global new Queue();
+  }
+
+  public boolean checkQuery(GlobalString x) {
+               boolean set = false;;
+               for (int i = 0 ; i < size; i++) {
+                       if (x.equals((GlobalString)elements[i])) {
+                               set = true;
+                               break;
+                       }
+               }
+               return set;
+  }
+
+       public void addQuery(GlobalString x) {
+               queries.push(x);
+       }
+}
diff --git a/QueryQueue.java b/QueryQueue.java
new file mode 100644 (file)
index 0000000..915bb4b
--- /dev/null
@@ -0,0 +1,34 @@
+public class QueryQueue {
+       HashSet queries;
+       int size;
+
+       public QueryQueue() {
+               queries = new HashSet();
+               size = 0;
+       }
+
+       public LocalQuery pop() {
+               if (queries.isEmpty())
+                       return null;
+               LocalQuery q = (LocalQuery) queries.iterator().next();
+               queries.remove(q);
+               size--;
+               return q;
+       }
+
+       public void push(LocalQuery x) {
+               queries.add(x);
+               size++;
+       }
+       
+       public int size() {
+               return size;
+       }
+
+       public boolean isEmpty() {
+               if (size == 0)
+                       return true;
+               else 
+                       return false;
+       }
+}
diff --git a/QueryTask.java b/QueryTask.java
new file mode 100644 (file)
index 0000000..e333984
--- /dev/null
@@ -0,0 +1,257 @@
+public class QueryTask extends Task {
+       int maxDepth;
+       Queue toprocess;
+       DistributedHashMap results;
+       GlobalString workingURL;
+
+  public QueryTask(Queue todoList, DistributedHashMap doneList, int maxDepth, DistributedHashMap results) {
+    this.todoList = todoList;
+               this.doneList = doneList;
+               this.maxDepth = maxDepth;
+               this.results = results;
+  }
+
+  public void execute() {
+               int depth;
+               int max;
+               
+               atomic {
+                       depth = ((GlobalQuery)myWork).getDepth();
+      max = this.maxDepth;
+               }
+
+               if (depth < max) {
+                       /* global variables */
+                       GlobalQuery gq;
+
+                       /* local variables */
+                       LocalQuery lq;
+                       String hostname;
+                       String path;
+
+                       atomic {
+                               gq = (GlobalQuery)myWork;
+                               hostname = new String(GlobalString.toLocalCharArray(gq.getHostName()));
+                               path = new String(GlobalString.toLocalCharArray(gq.getPath()));
+
+                               GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
+                               gsb.append("/");
+                               gsb.append(path);
+                               workingURL = global new GlobalString(gsb.toGlobalString());
+                       }
+                       lq = new LocalQuery(hostname, path, depth);
+
+                       System.printString(lq.getDepth()+" ");
+                       System.printString("Processing - Hostname : ");
+                       System.printString(hostname);
+                       System.printString(", Path : ");
+                       System.printString(path);
+                       System.printString("\n");
+
+                       Socket s = new Socket(hostname, 80);
+    
+                       requestQuery(hostname, path, s);
+                       readResponse(lq, s);
+
+                       atomic {
+                               processList(lq, workingURL, results);
+                       }
+
+                       atomic {
+                               toprocess = processPage(lq);
+                       }
+
+                       s.close();
+               }
+  }
+
+       public void done(Object obj) {
+               GlobalString str = global new GlobalString("true");
+               doneList.put(workingURL, str);
+
+               while(!toprocess.isEmpty()) {
+                       GlobalQuery q = (GlobalQuery)toprocess.pop();
+
+                       GlobalString hostname = global new GlobalString(q.getHostName());
+                       GlobalString path = global new GlobalString(q.getPath());
+
+                       GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
+                       gsb.append("/");
+                       gsb.append(path);
+
+                       if (!doneList.containsKey(gsb.toGlobalString())) {
+                               todoList.push(q);
+                       }
+               }
+       }
+
+       public static void requestQuery(String hostname, String path, Socket sock) {
+    StringBuffer req = new StringBuffer("GET "); 
+    req.append("/");
+               req.append(path);
+    req.append(" HTTP/1.1\r\nHost:");
+    req.append(hostname);
+    req.append("\r\n\r\n");
+    sock.write(req.toString().getBytes());
+  }
+
+       public static void readResponse(LocalQuery lq, Socket sock) {
+       //    state 0 - nothing
+       //    state 1 - \r
+       //    state 2 - \r\n
+       //    state 3 - \r\n\r
+       //    state 4 - \r\n\r\n
+    int state=0;
+    while(true) {
+      if (state<4) {
+        if (state==0) {
+          byte[] b=new byte[1];
+          int numchars=sock.read(b);
+          if ((numchars==1)) {
+            if (b[0]=='\r') {
+              state++;
+            }
+          } else
+                                               return;
+        } else if (state==1) {
+          byte[] b=new byte[1];
+          int numchars=sock.read(b);
+          if (numchars==1) {
+            if (b[0]=='\n')
+              state++;
+            else
+              state=0;
+          } else return;
+        } else if (state==2) {
+          byte[] b=new byte[1];
+          int numchars=sock.read(b);
+          if (numchars==1) {
+            if (b[0]=='\r')
+              state++;
+            else
+              state=0;
+          } else return;
+        } else if (state==3) {
+          byte[] b=new byte[1];
+          int numchars=sock.read(b);
+          if (numchars==1) {
+            if (b[0]=='\n')
+              state++;
+            else
+              state=0;
+          } else return;
+        }
+      } else {
+                               byte[] buffer=new byte[1024];
+        int numchars=sock.read(buffer);
+        if (numchars==0)
+          return;
+        else {
+          String curr=(new String(buffer)).subString(0,numchars);
+                                       lq.response.append(curr);
+        }
+      }
+    }
+  }
+
+       public static void processList(LocalQuery lq, GlobalString url, DistributedHashMap results) {
+               String sTitle = new String("<title>");  
+               String eTitle = new String("</title>");
+               String searchstr = lq.response.toString();
+               LinkedList ll;
+
+               int sIndex = searchstr.indexOf(sTitle);
+               if (sIndex != -1) {
+                       int eIndex = searchstr.indexOf(eTitle, sIndex+sTitle.length());
+                       String title = new String(searchstr.subString(sIndex+sTitle.length(), eIndex));
+                       ll = tokenize(title);
+
+                       Queue q;
+                       while (!ll.isEmpty()) {
+                               GlobalString word = global new GlobalString(ll.pop().toString());
+//                             q = (Queue)(results.get(word));
+
+//                             if (q == null) {
+                               if (!results.containsKey(word)) {
+                                       q = global new Queue();
+                               }
+                               else {
+                                       q = (Queue)(results.get(word));
+                               }
+                               q.push(url);
+                               results.put(word, q);
+
+                               System.out.println("Key : ["+word.toLocalString()+"],["+q.size()+"]");
+/*
+                               for (int i = 0; i < q.size(); i++) {
+                                       Object obj = q.elements[i];
+                                       GlobalString str = global new GlobalString((GlobalString)obj);
+                                       System.out.println("\t["+i+"] : "+str.toLocalString());
+                               }*/
+                       }
+               }
+       }
+
+       public static LinkedList tokenize(String str) {
+               LinkedList ll;
+               int sIndex = 0;
+               int eIndex = 0;
+               String token;
+
+               ll = new LinkedList();
+               
+               // and, or, of, at, but, '.', ',', ':' ';', '"', ' ', '-', '='
+               while (true) {
+                       eIndex = str.indexOf(' ', sIndex);
+                       if (eIndex == -1) {
+                               token = str.subString(sIndex);
+                               ll.add(token);
+                               break;
+                       }
+                       else {
+                               token = str.subString(sIndex, eIndex);
+                               ll.add(token);
+                               sIndex = eIndex+1;
+                       }
+               }
+               
+               return ll;
+       }
+       
+  public static Queue processPage(LocalQuery lq) {
+    int index = 0;
+       String href = new String("href=\"");
+       String searchstr = lq.response.toString();
+               int depth;
+       boolean cont = true;
+               Queue toprocess;
+
+               depth = lq.getDepth() + 1;
+
+               toprocess = global new Queue();
+
+               while(cont) {
+                       int mindex = searchstr.indexOf(href,index);
+                       if (mindex != -1) {     
+                               int endquote = searchstr.indexOf('"', mindex+href.length());
+               if (endquote != -1) {
+                     String match = searchstr.subString(mindex+href.length(), endquote);
+                                       String match2 = lq.makewebcanonical(match);
+       
+                                       GlobalString ghostname;
+                                       GlobalString gpath;
+
+                                       ghostname = global new GlobalString(lq.getHostName(match));
+                                       gpath = global new GlobalString(lq.getPathName(match));
+
+                     if (match2 != null) {
+                                                       GlobalQuery gq = global new GlobalQuery(ghostname, gpath, depth);
+                                                       toprocess.push(gq);
+                                       }
+                                       index = endquote;
+        } else cont = false;
+      } else cont = false;
+    }
+               return toprocess;
+  }
+}
diff --git a/Spider.java b/Spider.java
new file mode 100644 (file)
index 0000000..9335ef3
--- /dev/null
@@ -0,0 +1,60 @@
+public class Spider {
+       public static void main(String[] args) {
+               int NUM_THREADS = 3;
+               int maxDepth = 3;
+               int i, j;
+               Work[] works;
+               QueryTask[] qt;
+               GlobalQuery[] currentWorkList;
+
+               NUM_THREADS = Integer.parseInt(args[0]);
+
+               if (args.length == 3) {
+                       maxDepth = Integer.parseInt(args[2]);
+               }
+
+               GlobalString firstmachine;
+
+               int mid[] = new int[NUM_THREADS];
+               mid[0] = (128<<24)|(195<<16)|(180<<8)|21;        
+               mid[1] = (128<<24)|(195<<16)|(180<<8)|24;        
+               mid[2] = (128<<24)|(195<<16)|(180<<8)|26;        
+
+               atomic {
+                       firstmachine = global new GlobalString(args[1]);
+
+                       works = global new Work[NUM_THREADS];
+                       qt = global new QueryTask[NUM_THREADS];
+                       currentWorkList = global new GlobalQuery[NUM_THREADS];
+                       
+                       GlobalQuery firstquery = global new GlobalQuery(firstmachine);
+
+                       Queue todoList = global new Queue();
+                       DistributedHashMap doneList = global new DistributedHashMap(500, 500, 0.75f);
+                       DistributedHashMap results = global new DistributedHashMap(100, 100, 0.75f);
+                       
+                       todoList.push(firstquery);
+
+                       for (i = 0; i < NUM_THREADS; i++) {
+                               qt[i] = global new QueryTask(todoList, doneList, maxDepth, results);
+                               works[i] = global new Work(qt[i], NUM_THREADS, i, currentWorkList);
+                       }
+               }
+               System.printString("Finished to create Objects\n");
+
+               Work tmp;
+               for (i = 0; i < NUM_THREADS; i++) {
+                       atomic {
+                               tmp = works[i];
+                       }
+                       Thread.myStart(tmp, mid[i]);
+               }
+
+               for (i = 0; i < NUM_THREADS; i++) {
+                       atomic {
+                               tmp = works[i];
+                       }
+                       tmp.join();
+               }
+       }
+}
diff --git a/dstm.conf b/dstm.conf
new file mode 100644 (file)
index 0000000..935ef31
--- /dev/null
+++ b/dstm.conf
@@ -0,0 +1,3 @@
+128.195.180.21
+128.195.180.24
+128.195.180.26