rewrite
author bdemsky <bdemsky>
Thu, 8 Apr 2010 00:44:32 +0000 (00:44 +0000)
committer bdemsky <bdemsky>
Thu, 8 Apr 2010 00:44:32 +0000 (00:44 +0000)
Robust/src/Benchmarks/Recovery/Spider/recovery/QueryTask.java
Robust/src/Benchmarks/Recovery/Spider/recovery/Spider.java
Robust/src/Benchmarks/Recovery/Spider/recovery/makefile

index 5487849200b467ac6a0b4ebe86475f2f1b8df9be..8346b6e25dc441ecf78b0610ed82aeacfa4f0bb2 100644 (file)
 public class QueryTask extends Task {
-       int maxDepth;
-       int maxSearchDepth;
-       GlobalQueue toprocess;
-       DistributedHashMap results;
-       DistributedLinkedList results_list;
-       DistributedHashMap visitedList;
-       GlobalString gTitle;
-       GlobalString workingURL;
-
-  public QueryTask(GlobalQueue todoList, DistributedHashMap visitedList, int maxDepth, int maxSearchDepth, DistributedHashMap results, DistributedLinkedList results_list) {
-    this.todoList = todoList;
-               this.visitedList = visitedList;
-               this.maxDepth = maxDepth;
-               this.maxSearchDepth = maxSearchDepth;
-               this.results = results;
-               this.results_list = results_list;
-               toprocess = global new GlobalQueue();
-  }
-
-  public void execute() {
-               int depth;
-               int max;
-               int maxSearch;
-               
-               atomic {
-                       depth = ((GlobalQuery)myWork).getDepth();
-            max = this.maxDepth;
-                       maxSearch = this.maxSearchDepth;
-               }
-
-               if (depth < max) {
-                       /* global variables */
-                       GlobalQuery gq;
-
-                       /* local variables */
-                       LocalQuery lq;
-                       String hostname;
-                       String path;
-                       String title;
-
-                       atomic {
-                               gq = (GlobalQuery)myWork;
-                               hostname = new String(GlobalString.toLocalCharArray(gq.getHostName()));
-                               path = new String(GlobalString.toLocalCharArray(gq.getPath()));
-
-                               GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
-                               gsb.append("/");
-                               gsb.append(path);
-                               workingURL = global new GlobalString(gsb.toGlobalString());
-                               gTitle = null;
-                       }
-                       lq = new LocalQuery(hostname, path, depth);
-
-/*                     System.printString("["+lq.getDepth()+"] ");
-                       System.printString("Processing - Hostname : ");
-                       System.printString(hostname);
-                       System.printString(", Path : ");
-                       System.printString(path);
-                       System.printString("\n");
-*/
-                       if (isDocument(path)) {
-                               return;
-                       }
-
-                       Socket s = new Socket();
-
-                       if(s.connect(hostname, 80) == -1) {
-                               return;
-                       }
-
-                       requestQuery(hostname, path, s);
-                       readResponse(lq, s);
-
-                       if ((title = grabTitle(lq)) != null) {
-                               atomic {
-                                       gTitle = global new GlobalString(title);
-                               }
-                               atomic {
-                                       toprocess = processPage(lq);
-                               }
-                       }
-                       s.close();
-               }
+  int maxDepth;
+  int maxSearchDepth;
+  DistributedHashMap results;
+  DistributedLinkedList results_list;
+  DistributedHashMap visitedList;
+  GlobalString gTitle;
+  GlobalString workingURL;
+  GlobalString hostname;
+  GlobalString path;
+  int depth;
+
+  /**
+   * Creates the task that will fetch and index one page.
+   *
+   * @param visitedList    map of URLs already scheduled; handed on unchanged to child tasks
+   * @param maxDepth       tasks whose depth is not below this value do no fetching
+   * @param maxSearchDepth upper bound on the number of links one page may enqueue
+   * @param results        keyword index filled in by processedList()
+   * @param results_list   list of every URL that has been scheduled
+   * @param hostname       host of the page this task fetches
+   * @param path           path of the page on that host
+   * @param depth          link depth of this page
+   */
+  public QueryTask(DistributedHashMap visitedList,
+                   int maxDepth,
+                   int maxSearchDepth,
+                   DistributedHashMap results,
+                   DistributedLinkedList results_list,
+                   GlobalString hostname,
+                   GlobalString path,
+                   int depth) {
+    // Structures that every task created from this one also receives.
+    this.visitedList = visitedList;
+    this.results = results;
+    this.results_list = results_list;
+
+    // Crawl limits.
+    this.maxDepth = maxDepth;
+    this.maxSearchDepth = maxSearchDepth;
+
+    // The page this particular task is responsible for.
+    this.hostname = hostname;
+    this.path = path;
+    this.depth = depth;
   }
   
+  public void execute() {
+    int max;
+    int maxSearch;
+    int ldepth;
+
+    atomic {
+      max = this.maxDepth;
+      maxSearch = this.maxSearchDepth;
+      ldepth=this.depth;
+    }
+    
+    if (ldepth < max) {
+      /* local variables */
+      String hostname;
+      String path;
+      String title;
+      
+      atomic {
+       hostname = new String(GlobalString.toLocalCharArray(getHostName()));
+       path = new String(GlobalString.toLocalCharArray(getPath()));
        
-       public static boolean isDocument(String str) {
-               int index = str.lastindexOf('.');
-
-               if (index != -1) {
-                       if ((str.subString(index+1)).equals("pdf")) return true;
-                       else if ((str.subString(index+1)).equals("ps")) return true;
-                       else if ((str.subString(index+1)).equals("ppt")) return true;
-                       else if ((str.subString(index+1)).equals("pptx")) return true;
-                       else if ((str.subString(index+1)).equals("jpg")) return true;
-                       else if ((str.subString(index+1)).equals("mp3")) return true;
-                       else if ((str.subString(index+1)).equals("wmv")) return true;
-                       else if ((str.subString(index+1)).equals("doc")) return true;
-                       else if ((str.subString(index+1)).equals("docx")) return true;
-                       else if ((str.subString(index+1)).equals("mov")) return true;
-                       else if ((str.subString(index+1)).equals("flv")) return true;
-                       else if ((str.subString(index+1)).equals("tar")) return true;
-                       else if ((str.subString(index+1)).equals("tgz")) return true;
-                       else return false;
-               }
-               return false;
-       }
-
-       public void done(Object obj) {
-               if ((gTitle != null) && (gTitle.length() > 0)) {
-                       processedList();
-               }
-
-               int searchCnt = 0;
-               while(!toprocess.isEmpty()) {
-                       GlobalQuery q = (GlobalQuery)toprocess.pop();
-
-                       GlobalString hostname = global new GlobalString(q.getHostName());
-                       GlobalString path = global new GlobalString(q.getPath());
-
-                       GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
-                       gsb.append("/");
-                       gsb.append(path);
-
-                       if (!visitedList.containsKey(gsb.toGlobalString()) && (searchCnt < maxSearchDepth)) {
-                               todoList.push(q);
-                                       
-                               GlobalString str = global new GlobalString("1");
-                               visitedList.put(gsb.toGlobalString(), str);
-                               results_list.add(gsb.toGlobalString());
-                               searchCnt++;
-                       }
-               }
+       GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
+       gsb.append("/");
+       gsb.append(path);
+       workingURL = global new GlobalString(gsb.toGlobalString());
+       gTitle = null;
+      }
+      LocalQuery lq = new LocalQuery(hostname, path, ldepth);
+
+      if (isDocument(path)) {
+       return;
+      }
+      
+      Socket s = new Socket();
+
+      if(s.connect(hostname, 80) == -1) {
+       return;
+      }
+      
+      requestQuery(hostname, path, s);
+      readResponse(lq, s);
+      
+      if ((title = grabTitle(lq)) != null) {
+       atomic {
+         //commits everything...either works or fails
+         gTitle = global new GlobalString(title);
+         processPage(lq);
+         dequeueTask();
        }
+      }
+      s.close();
+    } else {
+      atomic {
+       dequeueTask();
+      }
+    }
+  }
+  
+  public int getDepth() {
+    return depth;
+  }
+  
+  public GlobalString getHostName() {
+    return hostname;
+  }
+  
+  public GlobalString getPath() {
+    return path;
+  }
 
-       public void output() {
-               String str;
-               Iterator iter = results_list.iterator();
-
+  public GlobalString makewebcanonical(GlobalString page) {
+    GlobalStringBuffer b = global new GlobalStringBuffer(getHostName(page));
+    b.append("/");
+    b.append(getPathName(page));
+    return b.toGlobalString();
+  }
+  
+  public GlobalString getHostName(GlobalString page) {
+    GlobalString http = global new GlobalString("http://");
+    GlobalString https = global new GlobalString("https://");
+    int beginindex;
+    int endindex;
+    
+    if ((page.indexOf(http) == -1) && (page.indexOf(https) == -1)) {
+      return getHostName();
+    } else if (page.indexOf(https) != -1) {
+      beginindex = page.indexOf(https) + https.length();
+    } else {
+      beginindex = page.indexOf(http) + http.length();
+    }
+    endindex = page.indexOf('/',beginindex+1);
+    
+    if ((beginindex == -1)) {  
+      System.printString("ERROR");
+    }
+    if (endindex == -1)
+      endindex = page.length();
+    
+    return page.subString(beginindex, endindex);
+  }
+  
+  
+  public GlobalString getPathName(GlobalString page) {
+    GlobalString http = global new GlobalString("http://");
+    GlobalString https = global new GlobalString("https://");
+    int beginindex;
+    int nextindex;
+    
+    if ((page.indexOf(http) == -1) && (page.indexOf(https) == -1)) {
+      GlobalString path = getPath();
+      int lastindex = path.lastindexOf('/');
+      if (lastindex == -1)
+        return page;
+      
+      GlobalStringBuffer sb = global new GlobalStringBuffer(path.subString(0,lastindex+1));
+      sb.append(page);
+      return sb.toGlobalString();
+    } else if (page.indexOf(https) != -1) {
+      beginindex = page.indexOf(https) + https.length();
+    } else {
+      beginindex = page.indexOf(http) + http.length();
+    }
+    nextindex = page.indexOf('/',beginindex+1);
+    
+    if ((beginindex == -1) || (nextindex == -1))
+      return global new GlobalString("index.html");
+    return page.subString(nextindex+1, page.length());
+  }
+  
+  public static boolean isDocument(String str) {
+    int index = str.lastindexOf('.');
+    
+    if (index != -1) {
+      if ((str.subString(index+1)).equals("pdf")) return true;
+      else if ((str.subString(index+1)).equals("ps")) return true;
+      else if ((str.subString(index+1)).equals("ppt")) return true;
+      else if ((str.subString(index+1)).equals("pptx")) return true;
+      else if ((str.subString(index+1)).equals("jpg")) return true;
+      else if ((str.subString(index+1)).equals("mp3")) return true;
+      else if ((str.subString(index+1)).equals("wmv")) return true;
+      else if ((str.subString(index+1)).equals("doc")) return true;
+      else if ((str.subString(index+1)).equals("docx")) return true;
+      else if ((str.subString(index+1)).equals("mov")) return true;
+      else if ((str.subString(index+1)).equals("flv")) return true;
+      else if ((str.subString(index+1)).equals("tar")) return true;
+      else if ((str.subString(index+1)).equals("tgz")) return true;
+      else return false;
+    }
+    return false;
+  }
+  
+  public void output() {
+    String str;
+    Iterator iter = results_list.iterator();
+    
     System.out.println("Size = " + results_list.size());
+  }
 
-/*             while (iter.hasNext() == true) {
-                       str = ((GlobalString)(iter.next())).toLocalString();
-                       System.printString(str + "\n");
-               }*/
-       }
-
-       public static String grabTitle(LocalQuery lq) {
-               String sBrace = new String("<");        
-               String strTitle = new String("title>");
-       String searchstr = lq.response.toString();
-               String title = null;
-               char ch;
-
-               int mindex = -1;
-               int endquote = -1;
-               int i, j;
-               String tmp;
-
-               for (i = 0; i < searchstr.length(); i++) {
-                       if (searchstr.charAt(i) == '<') {
-                               i++;
-                               if (searchstr.length() > (i+strTitle.length())) {
-                                       tmp = searchstr.subString(i, i+strTitle.length());
-                                       if (tmp.equalsIgnoreCase("title>")) {
-                                               mindex = i + tmp.length();
-                                               for (j = mindex; j < searchstr.length(); j++) {
-                                                       if (searchstr.charAt(j) == '<') {
-                                                               j++;
-                                                               tmp = searchstr.subString(j, j+strTitle.length()+1);                    
-                                                               if (tmp.equalsIgnoreCase("/title>")) {
-                                                                       endquote = j - 1;
-                                                                       break;
-                                                               }
-                                                       }
-                                               }
-                                       }
-                               }
-                       }
-               }
-
-               if (mindex != -1) {
-                       title = searchstr.subString(mindex, endquote);
-                       if (Character.isWhitespace(title.charAt(0))){
-                               mindex=0;
-                               while (Character.isWhitespace(title.charAt(mindex++)));
-                               mindex--;
-                               if (mindex >= title.length()) return null;
-                               title = new String(title.subString(mindex));
-                       }
-
-                       if (Character.isWhitespace(title.charAt(title.length()-1))) {
-                               endquote=title.length()-1;
-                               while (Character.isWhitespace(title.charAt(endquote--)));
-                               endquote += 2;
-                               if (mindex >= endquote) return null;
-                               title = new String(title.subString(0, endquote));
-                       }
-
-                       if (isErrorPage(title)) {
-                               return null;
-                       }
+  public static String grabTitle(LocalQuery lq) {
+    String sBrace = new String("<");   
+    String strTitle = new String("title>");
+    String searchstr = lq.response.toString();
+    String title = null;
+    char ch;
+    
+    int mindex = -1;
+    int endquote = -1;
+    int i, j;
+    String tmp;
+    
+    for (i = 0; i < searchstr.length(); i++) {
+      if (searchstr.charAt(i) == '<') {
+       i++;
+       if (searchstr.length() > (i+strTitle.length())) {
+         tmp = searchstr.subString(i, i+strTitle.length());
+         if (tmp.equalsIgnoreCase("title>")) {
+           mindex = i + tmp.length();
+           for (j = mindex; j < searchstr.length(); j++) {
+             if (searchstr.charAt(j) == '<') {
+               j++;
+               tmp = searchstr.subString(j, j+strTitle.length()+1);                    
+               if (tmp.equalsIgnoreCase("/title>")) {
+                 endquote = j - 1;
+                 break;
                }
-
-               return title;
-       }
-
-       public static boolean isErrorPage(String str) { 
-               if (str.equals("301 Moved Permanently")) 
-                       return true;
-               else if (str.equals("302 Found")) 
-                       return true;
-               else if (str.equals("404 Not Found")) 
-                       return true;
-               else if (str.equals("403 Forbidden")) 
-                       return true;
-               else if (str.equals("404 File Not Found")) 
-                       return true;
-               else
-                       return false;
+             }
+           }
+         }
        }
+      }
+    }
+    
+    if (mindex != -1) {
+      title = searchstr.subString(mindex, endquote);
+      if (Character.isWhitespace(title.charAt(0))){
+       mindex=0;
+       while (Character.isWhitespace(title.charAt(mindex++)));
+       mindex--;
+       if (mindex >= title.length()) return null;
+       title = new String(title.subString(mindex));
+      }
+      
+      if (Character.isWhitespace(title.charAt(title.length()-1))) {
+       endquote=title.length()-1;
+       while (Character.isWhitespace(title.charAt(endquote--)));
+       endquote += 2;
+       if (mindex >= endquote) return null;
+       title = new String(title.subString(0, endquote));
+      }
+      
+      if (isErrorPage(title)) {
+       return null;
+      }
+    }
+    
+    return title;
+  }
 
-       public static void requestQuery(String hostname, String path, Socket sock) {
+  public static boolean isErrorPage(String str) {      
+    if (str.equals("301 Moved Permanently")) 
+      return true;
+    else if (str.equals("302 Found")) 
+      return true;
+    else if (str.equals("404 Not Found")) 
+      return true;
+    else if (str.equals("403 Forbidden")) 
+      return true;
+    else if (str.equals("404 File Not Found")) 
+      return true;
+    else
+      return false;
+  }
+  
+  public static void requestQuery(String hostname, String path, Socket sock) {
     StringBuffer req = new StringBuffer("GET "); 
     req.append("/");
-               req.append(path);
-         req.append(" HTTP/1.0\r\nHost: ");
+    req.append(path);
+    req.append(" HTTP/1.0\r\nHost: ");
     req.append(hostname);
     req.append("\r\n\r\n");
     sock.write(req.toString().getBytes());
   }
-
-       public static void readResponse(LocalQuery lq, Socket sock) {
-       //    state 0 - nothing
-       //    state 1 - \r
-       //    state 2 - \r\n
-       //    state 3 - \r\n\r
-       //    state 4 - \r\n\r\n
-               byte[] buffer = new byte[1024];
-               int numchars;
-
-               do {
-                       numchars = sock.read(buffer);
-
-                       String curr = (new String(buffer)).subString(0, numchars);
-                       
-                       lq.response.append(curr);
-                       buffer = new byte[1024];
-               } while(numchars > 0);
+  
+  public static void readResponse(LocalQuery lq, Socket sock) {
+    //    state 0 - nothing
+    //    state 1 - \r
+    //    state 2 - \r\n
+    //    state 3 - \r\n\r
+    //    state 4 - \r\n\r\n
+    byte[] buffer = new byte[1024];
+    int numchars;
+    
+    do {
+      numchars = sock.read(buffer);
+      
+      String curr = (new String(buffer)).subString(0, numchars);
+      
+      lq.response.append(curr);
+      buffer = new byte[1024];
+    } while(numchars > 0);
   }
 
-       public void processedList() {
-               LinkedList ll;
-               GlobalString token = null;
-               int mindex = 0;
-               int endquote = 0;
-
-               while (endquote != -1) {
-                       endquote = gTitle.indexOf(' ', mindex);
-
-                       if (endquote != -1) {
-                               token = gTitle.subString(mindex, endquote);
-                               mindex = endquote + 1;
-                               if (filter(token)) {
-                                       continue;
-                               }
-                               token = refine(token);
-                       }
-                       else {
-                               token = gTitle.subString(mindex);
-                               token = refine(token);
-                       }
-
-                       GlobalQueue q = (GlobalQueue)results.get(token);
-                       if (q == null) {
-                               q = global new GlobalQueue();
-                       }
-                       q.push(workingURL);     
-                       results.put(token, q);
-               }
-       }
-
-       public boolean filter(GlobalString str) {
-               if (str.equals("of"))   return true;
-               else if (str.equals("for")) return true;
-               else if (str.equals("a")) return true;
-               else if (str.equals("an")) return true;
-               else if (str.equals("the")) return true;
-               else if (str.equals("at")) return true;
-               else if (str.equals("and")) return true;
-               else if (str.equals("or")) return true;
-               else if (str.equals("but")) return true;
-               else if (str.equals("to")) return true;
-               else if (str.equals("The")) return true;
-               else if (str.length() == 1) {
-                       if (str.charAt(0) == '.') return true;
-                       else if (str.charAt(0) == '.') return true;
-                       else if (str.charAt(0) == '-') return true;
-                       else if (str.charAt(0) == '=') return true;
-                       else if (str.charAt(0) == '_') return true;
-                       else if (str.charAt(0) == ':') return true;
-                       else if (str.charAt(0) == ';') return true;
-                       else if (str.charAt(0) == '\'') return true;
-                       else if (str.charAt(0) == '\"') return true;
-                       else if (str.charAt(0) == '|') return true;
-                       else if (str.charAt(0) == '@') return true;
-                       else if (str.charAt(0) == '&') return true;
-                       else if (str.charAt(0) == ' ') return true;
-               }
-               else return false;
-       }
-
-       public GlobalString refine(GlobalString str) {
-               str = refinePrefix(str);
-               str = refinePostfix(str);
-               return str;
+  public void processedList() {
+    LinkedList ll;
+    GlobalString token = null;
+    int mindex = 0;
+    int endquote = 0;
+    
+    while (endquote != -1) {
+      endquote = gTitle.indexOf(' ', mindex);
+      
+      if (endquote != -1) {
+       token = gTitle.subString(mindex, endquote);
+       mindex = endquote + 1;
+       if (filter(token)) {
+         continue;
        }
+       token = refine(token);
+      } else {
+       token = gTitle.subString(mindex);
+       token = refine(token);
+      }
+      
+      GlobalQueue q = (GlobalQueue)results.get(token);
+      if (q == null) {
+       q = global new GlobalQueue();
+      }
+      q.push(workingURL);      
+      results.put(token, q);
+    }
+  }
 
-       public GlobalString refinePrefix(GlobalString str) {
-               if (str.charAt(0) == '&') {             // &
-                       return str.subString(1);
-               }
-               else if (str.charAt(0) == '/') {                // &
-                       return str.subString(1);
-               }
-               return str;
-       }
+  public boolean filter(GlobalString str) {
+    if (str.equals("of"))      return true;
+    else if (str.equals("for")) return true;
+    else if (str.equals("a")) return true;
+    else if (str.equals("an")) return true;
+    else if (str.equals("the")) return true;
+    else if (str.equals("at")) return true;
+    else if (str.equals("and")) return true;
+    else if (str.equals("or")) return true;
+    else if (str.equals("but")) return true;
+    else if (str.equals("to")) return true;
+    else if (str.equals("The")) return true;
+    else if (str.length() == 1) {
+      if (str.charAt(0) == '.') return true;
+      else if (str.charAt(0) == '.') return true;
+      else if (str.charAt(0) == '-') return true;
+      else if (str.charAt(0) == '=') return true;
+      else if (str.charAt(0) == '_') return true;
+      else if (str.charAt(0) == ':') return true;
+      else if (str.charAt(0) == ';') return true;
+      else if (str.charAt(0) == '\'') return true;
+      else if (str.charAt(0) == '\"') return true;
+      else if (str.charAt(0) == '|') return true;
+      else if (str.charAt(0) == '@') return true;
+      else if (str.charAt(0) == '&') return true;
+      else if (str.charAt(0) == ' ') return true;
+    }
+    else return false;
+  }
+  
+  public GlobalString refine(GlobalString str) {
+    str = refinePrefix(str);
+    str = refinePostfix(str);
+    return str;
+  }
+  
+  public GlobalString refinePrefix(GlobalString str) {
+    if (str.charAt(0) == '&') {                // &
+      return str.subString(1);
+    } else if (str.charAt(0) == '/') {         // &
+      return str.subString(1);
+    }
+    return str;
+  }
+  
+  public GlobalString refinePostfix(GlobalString str) {
+    if (str.charAt(str.length()-1) == ',') {                   // ,
+      return str.subString(0, str.length()-1);
+    } else if (str.charAt(str.length()-1) == ':') {            // :
+      return str.subString(0, str.length()-1);
+    } else if (str.charAt(str.length()-1) == ';') {            // ;
+      return str.subString(0, str.length()-1);
+    } else if (str.charAt(str.length()-1) == '!') {            // !
+      return str.subString(0, str.length()-1);
+    } else if (str.charAt(str.length()-1) == 's') {                    // 's
+      if (str.charAt(str.length()-2) == '\'')
+       return str.subString(0, str.length()-2);        
+    } else if (str.charAt(str.length()-1) == '-') {
+      int index = str.length()-2;
+      while (Character.isWhitespace(str.charAt(index--)));
+      return str.subString(0, index+2);
+    }
+    return str;
+  }
+  
+  
+  public void processPage(LocalQuery lq) {
+    if ((gTitle != null) && (gTitle.length() > 0)) {
+      processedList();
+    }
 
-       public GlobalString refinePostfix(GlobalString str) {
-               if (str.charAt(str.length()-1) == ',') {                        // ,
-                       return str.subString(0, str.length()-1);
-               }
-               else if (str.charAt(str.length()-1) == ':') {           // :
-                       return str.subString(0, str.length()-1);
-               }
-               else if (str.charAt(str.length()-1) == ';') {           // ;
-                       return str.subString(0, str.length()-1);
-               }
-               else if (str.charAt(str.length()-1) == '!') {           // !
-                       return str.subString(0, str.length()-1);
-               }
-               else if (str.charAt(str.length()-1) == 's') {                   // 's
-                       if (str.charAt(str.length()-2) == '\'')
-                               return str.subString(0, str.length()-2);        
-               }
-               else if (str.charAt(str.length()-1) == '-') {
-                       int index = str.length()-2;
-                       while (Character.isWhitespace(str.charAt(index--)));
-                       return str.subString(0, index+2);
-               }
-               return str;
-       }
-       
-  public static GlobalQueue processPage(LocalQuery lq) {
     int index = 0;
-       String href = new String("href=\"");
-       String searchstr = lq.response.toString();
-               int depth;
-       boolean cont = true;
-               GlobalQueue toprocess;
-
-               depth = lq.getDepth() + 1;
-
-               toprocess = global new GlobalQueue();
-               while(cont) {
-                       int mindex = searchstr.indexOf(href,index);
-                       if (mindex != -1) {     
-                               int endquote = searchstr.indexOf('"', mindex+href.length());
-               if (endquote != -1) {
-                     String match = searchstr.subString(mindex+href.length(), endquote);
-                                       String match2 = lq.makewebcanonical(match);
-       
-                                       GlobalString ghostname;
-                                       GlobalString gpath;
-
-                                       ghostname = global new GlobalString(lq.getHostName(match));
-                                       gpath = global new GlobalString(lq.getPathName(match));
-
-                     if (match2 != null) {
-                                                       GlobalQuery gq = global new GlobalQuery(ghostname, gpath, depth);
-                                                       toprocess.push(gq);
-                                       }
-                                       index = endquote;
-        } else cont = false;
-      } else cont = false;
+    String href = new String("href=\"");
+    String searchstr = lq.response.toString();
+    int searchCnt = 0;    
+    while(true) {
+      int mindex = searchstr.indexOf(href,index);
+      if (mindex != -1) {      
+       int endquote = searchstr.indexOf('"', mindex+href.length());
+       if (endquote != -1) {
+         String match = searchstr.subString(mindex+href.length(), endquote);
+         String match2 = lq.makewebcanonical(match);
+         
+         GlobalString ghostname;
+         GlobalString gpath;
+         
+         ghostname = global new GlobalString(lq.getHostName(match));
+         gpath = global new GlobalString(lq.getPathName(match));
+         
+         GlobalStringBuffer gsb = global new GlobalStringBuffer(ghostname);
+         gsb.append("/");
+         gsb.append(gpath);
+
+         if (match2 != null) {
+           if (!visitedList.containsKey(gsb.toGlobalString()) && (searchCnt < maxSearchDepth)) {
+             GlobalString str = global new GlobalString("1");
+             visitedList.put(gsb.toGlobalString(), str);
+             results_list.add(gsb.toGlobalString());
+             searchCnt++;
+             QueryTask gq = global new QueryTask(visitedList, maxDepth, maxSearchDepth, results, results_list, ghostname, gpath, lq.getDepth()+1);
+             enqueueTask(gq);
+           }
+         }
+         index = endquote;
+        } else 
+         break;
+      } else 
+       break;
     }
-               return toprocess;
   }
 }
index 9ba0cd3e860cb503e8b2706af7c0979bb33dcf7c..0f82b51b320f9307fb157a44d7ab1f69d65416fb 100644 (file)
@@ -5,85 +5,68 @@ Usage :
 
 
 public class Spider {
-       public static void main(String[] args) {
-               int NUM_THREADS = 3;
-               int maxDepth = 3;
-               int maxSearchDepth = 10;
-               int i, j;
-               Work[] works;
-               QueryTask[] qt;
-               GlobalQuery[] currentWorkList;
+  public static void main(String[] args) { // entry point: parse args, create the workers, seed the first query, then start and join the workers
+    int NUM_THREADS = 3; // overwritten from args[0] below
+    int maxDepth = 3; // overwritten from args[2] below
+    int maxSearchDepth = 10; // never read from the command line; handed unchanged to the first QueryTask
+    int i, j; // only i is used in this version of main
     String fm = "www.uci.edu";
     String fp = "";
-
+    
     if(args.length != 3) {
       System.out.println("./Spider.java master <num_thread> <first machine> <maxDepth>");
       System.exit(0);
-    }
-    else {
+    } else { // exactly three args are consumed: args[0]=thread count, args[1]=first machine, args[2]=max depth. NOTE(review): the usage text above lists four tokens ("master" plus three) — confirm which is intended
       NUM_THREADS = Integer.parseInt(args[0]);
       fm = args[1];
       maxDepth = Integer.parseInt(args[2]);
     }
+    
+    int mid[] = new int[8]; // one entry per worker, passed to Thread.myStart below. NOTE(review): only 8 slots, and NUM_THREADS is not checked against this size
 
-               GlobalString firstmachine;
-               GlobalString firstpage;
-
-               int mid[] = new int[8];
-
-    /*
-               mid[0] = (128<<24)|(195<<16)|(180<<8)|21;
-               mid[1] = (128<<24)|(195<<16)|(180<<8)|26;
-               mid[2] = (128<<24)|(195<<16)|(180<<8)|24;
-  */
-               mid[0] = (128<<24)|(195<<16)|(136<<8)|162;
-               mid[1] = (128<<24)|(195<<16)|(136<<8)|163;
-               mid[2] = (128<<24)|(195<<16)|(136<<8)|164;
-               mid[3] = (128<<24)|(195<<16)|(136<<8)|165;
-               mid[4] = (128<<24)|(195<<16)|(136<<8)|166;
-               mid[5] = (128<<24)|(195<<16)|(136<<8)|167;
-               mid[6] = (128<<24)|(195<<16)|(136<<8)|168;
-               mid[7] = (128<<24)|(195<<16)|(136<<8)|169;
-
-               atomic {
-                       firstmachine = global new GlobalString(fm);
-                       firstpage = global new GlobalString("");
-      
-                       works = global new Work[NUM_THREADS];
-                       qt = global new QueryTask[NUM_THREADS];
-                       currentWorkList = global new GlobalQuery[NUM_THREADS];
-                       
-                       GlobalQuery firstquery = global new GlobalQuery(firstmachine, firstpage);
-
-                       GlobalQueue todoList = global new GlobalQueue();
-                       DistributedHashMap visitedList = global new DistributedHashMap(500, 500, 0.75f);
-                       //DistributedHashMap visitedList = global new DistributedHashMap(500, 0.75f);
-                       DistributedHashMap results = global new DistributedHashMap(100, 100, 0.75f);
-                       //DistributedHashMap results = global new DistributedHashMap(100, 0.75f);
-                       DistributedLinkedList results_list = global new DistributedLinkedList();
-                       
-                       todoList.push(firstquery);
-
-                       for (i = 0; i < NUM_THREADS; i++) {
-                               qt[i] = global new QueryTask(todoList, visitedList, maxDepth, maxSearchDepth, results, results_list);
-                               works[i] = global new Work(qt[i], NUM_THREADS, i, currentWorkList);
-                       }
-               }
-               System.printString("Finished to create Objects\n");
+    mid[0] = (128<<24)|(195<<16)|(136<<8)|162; // four byte-sized fields packed high-to-low into one int; presumably the IPv4 address 128.195.136.162 — confirm
+    mid[1] = (128<<24)|(195<<16)|(136<<8)|163;
+    mid[2] = (128<<24)|(195<<16)|(136<<8)|164;
+    mid[3] = (128<<24)|(195<<16)|(136<<8)|165;
+    mid[4] = (128<<24)|(195<<16)|(136<<8)|166;
+    mid[5] = (128<<24)|(195<<16)|(136<<8)|167;
+    mid[6] = (128<<24)|(195<<16)|(136<<8)|168;
+    mid[7] = (128<<24)|(195<<16)|(136<<8)|169;
+    
+    TaskSet ts; // declared outside the atomic blocks because it is used by all of them
+    atomic {
+      // set up workers: one Worker per thread slot, each given the TaskSet and its own index
+      ts=global new TaskSet(NUM_THREADS);
+      for (i = 0; i < NUM_THREADS; i++) {
+       ts.threads[i] = global new Worker(ts,i);
+      }
+    }
 
-               Work tmp;
-               for (i = 0; i < NUM_THREADS; i++) {
-                       atomic {
-                               tmp = works[i];
-                       }
-                       Thread.myStart(tmp, mid[i]);
-               }
+    atomic { // seed the crawl: build the visited map, results map and results list, then queue the first task
+      GlobalString firstmachine = global new GlobalString(fm);
+      GlobalString firstpage = global new GlobalString("");
+      DistributedHashMap visitedList = global new DistributedHashMap(500, 500, 0.75f);
+      DistributedHashMap results = global new DistributedHashMap(100, 100, 0.75f);
+      DistributedLinkedList results_list = global new DistributedLinkedList();
+      QueryTask firstquery = global new QueryTask(visitedList, maxDepth, maxSearchDepth, results, results_list, firstmachine, firstpage, 0); // final argument 0 is presumably the starting depth — confirm against the QueryTask constructor
+      ts.todo.push(firstquery); // the first task goes on the TaskSet's todo queue
+    }
 
-               for (i = 0; i < NUM_THREADS; i++) {
-                       atomic {
-                               tmp = works[i];
-                       }
-                       tmp.join();
-               }
-       }
+    System.printString("Finished to create Objects\n");
+    
+    
+    Worker tmp; // local handle; each Worker reference is fetched inside an atomic block and used outside it
+    for (i = 0; i < NUM_THREADS; i++) { // start phase: launch every worker
+      atomic {
+       tmp = ts.threads[i];
+      }
+      Thread.myStart(tmp, mid[i]); // mid[i] is presumably the machine this worker runs on — confirm. NOTE(review): i reaches NUM_THREADS-1, so a thread count above 8 indexes past the end of mid
+    }
+    for (i = 0; i < NUM_THREADS; i++) { // join phase: join each worker in turn
+      atomic {
+       tmp = ts.threads[i];
+      }
+      tmp.join();
+    }
+  }
 }
index 88e489b40a89dbce15771542e881eb8e66cc24c4..98039679cbc8fe8f02787ebcb88db5a1509255e2 100644 (file)
@@ -7,7 +7,7 @@ FLAGS=-recoverystats -recovery -transstats -dsmcaching -dsm -dsmtask -debug -noo
 DSMFLAGS=-dsm -dsmtask -sandbox -transstats -debug -nooptimize -mainclass ${MAINCLASS}
 default:
 #      ../../../../buildscript ${FLAGS} -o ${MAINCLASS} ${SRC2} ${SRC3} ${SRC1}
-       ../../../../buildscript ${DSMFLAGS} -o ${MAINCLASS}DSM ${SRC2} ${SRC3} ${SRC1}
+       ../../../../buildscript ${DSMFLAGS} -o ${MAINCLASS}DSM *.java
 
 clean:
        rm -rf tmpbuilddirectory