dsm version
author jihoonl <jihoonl>
Tue, 10 Nov 2009 08:04:28 +0000 (08:04 +0000)
committer jihoonl <jihoonl>
Tue, 10 Nov 2009 08:04:28 +0000 (08:04 +0000)
Robust/src/Benchmarks/Spider/dsm/Query.java [deleted file]
Robust/src/Benchmarks/Spider/dsm/QueryThread.java
Robust/src/Benchmarks/Spider/dsm/Spider.java
Robust/src/Benchmarks/Spider/dsm/makefile

diff --git a/Robust/src/Benchmarks/Spider/dsm/Query.java b/Robust/src/Benchmarks/Spider/dsm/Query.java
deleted file mode 100644 (file)
index 7812fff..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-public class Query {
-  GlobalString hostname;
-  GlobalString path;
-       int depth;
-  
-  public Query(GlobalString hostname, GlobalString path, int depth) {
-    this.hostname = global new GlobalString(hostname);
-    this.path = global new GlobalString(path);
-               this.depth = depth;
-  }
-
-       public int getDepth() {
-               return depth;
-       }
-       
-  public GlobalString getHostName() {
-    return hostname;
-  }
-  public GlobalString getPath() {
-    return path;
-  }
-
-  public GlobalString getHostName(GlobalString page) {
-    GlobalString http = global new GlobalString("http://");
-    if (page.indexOf(http) == -1) {
-      return getHostName();
-    } else {
-      int beginindex = page.indexOf(http) + http.length();
-           int endindex = page.indexOf('/',beginindex+1);
-           if ((beginindex == -1)) {
-        System.printString("ERROR");
-           }
-           if (endindex == -1)
-        endindex = page.length();
-      return page.subString(beginindex, endindex);
-    }
-  }
-
-  
-       public GlobalString getPathName(GlobalString page) {
-    GlobalString http = global new GlobalString("http://");
-    if (page.indexOf(http) == -1) {
-      GlobalString path = getPath();
-           int lastindex = path.lastindexOf('/');
-           if (lastindex == -1)
-        return page;
-           
-      GlobalStringBuffer sb = global new GlobalStringBuffer(path.subString(0,lastindex+1));
-           sb.append(page);
-      return sb.toGlobalString();
-    } else {
-      int beginindex = page.indexOf(http)+http.length();
-           int nextindex = page.indexOf('/',beginindex+1);
-           if ((beginindex == -1) || (nextindex == -1))
-        return global new GlobalString("index.html");
-      return page.subString(nextindex+1, page.length());
-    }
-  }
-}
index 77db3cd74454ecadec227f12851bc4214f1c48e7..18dee05e73b12968a37aeff67675284a738d5e15 100644 (file)
@@ -15,11 +15,13 @@ public class QueryThread extends Thread {
+  /**
+   * Initializes a QueryThread from the structures handed in by the caller.
+   *
+   * Each argument is stored in the field of the same name (mid is stored in
+   * MY_MID). toprocess is not passed in: it is allocated here as a new
+   * global Queue, so done() always has a queue to drain.
+   */
   public QueryThread(Queue todoList, DistributedHashMap doneList, DistributedHashMap results,int maxDepth,int mid,int NUM_THREADS,GlobalQuery[] currentWorkList) {    
     this.todoList = todoList;
                this.doneList = doneList;
-    this.results = results;
-               this.maxDepth = maxDepth;
     this.currentWorkList = currentWorkList;
     this.MY_MID = mid;
     this.NUM_THREADS = NUM_THREADS;
+
+               this.maxDepth = maxDepth;
+    this.results = results;
+    toprocess = global new Queue();
   }
 
   public void run()
@@ -143,7 +145,17 @@ public class QueryThread extends Thread {
                        System.printString(path);
                        System.printString("\n");
 
-                       Socket s = new Socket(hostname, 80);
+      // check if the url is pdf, ps, ppt, pptx ... etc
+      if(isDocument(path)) {
+        return;
+      }
+
+                       Socket s = new Socket();
+
+      // connection fail
+      if(s.connect(hostname, 80) == -1) {
+        return;
+      }
     
                        requestQuery(hostname, path, s);
                        readResponse(lq, s);
@@ -152,23 +164,38 @@ public class QueryThread extends Thread {
                                atomic {
                                        qt.gTitle = global new GlobalString(title);
                                }
-                       }
-
-                       atomic {
-                               qt.toprocess = processPage(lq);
-                       }
-
+                       atomic {
+                               qt.toprocess = processPage(lq);
+                       }
+      }
                        s.close();
                }
   }
 
+  /**
+   * Reports whether a path names a document or media file that should not be
+   * fetched and parsed as a web page.
+   *
+   * Only the text after the last '.' in str is examined. It is compared
+   * without regard to case, so "talk.PDF" is skipped just like "talk.pdf".
+   *
+   * @param str path of the URL about to be requested
+   * @return true when the extension is one of the skipped types, false when
+   *         it is anything else or when str contains no '.'
+   */
+  public static boolean isDocument(String str) {
+               int index = str.lastindexOf('.');
+
+               if (index == -1) {
+                       return false;
+               }
+
+               // Slice the extension once instead of once per comparison.
+               String ext = str.subString(index+1);
+
+               if (ext.equalsIgnoreCase("pdf")) return true;
+               else if (ext.equalsIgnoreCase("ps")) return true;
+               else if (ext.equalsIgnoreCase("ppt")) return true;
+               else if (ext.equalsIgnoreCase("pptx")) return true;
+               else if (ext.equalsIgnoreCase("jpg")) return true;
+               else if (ext.equalsIgnoreCase("mp3")) return true;
+               else if (ext.equalsIgnoreCase("wmv")) return true;
+               else if (ext.equalsIgnoreCase("doc")) return true;
+               else if (ext.equalsIgnoreCase("docx")) return true;
+               else if (ext.equalsIgnoreCase("mov")) return true;
+               else if (ext.equalsIgnoreCase("flv")) return true;
+
+               return false;
+       }
+
+       /**
+        * Post-processing step for the page that was just handled.
+        *
+        * When a non-empty gTitle was captured, processList() is run. After that
+        * toprocess is drained: each popped GlobalQuery whose key is not yet in
+        * doneList is pushed onto todoList and recorded in doneList.
+        *
+        * The statements that build the key (gsb) lie between the two hunks of
+        * this patch and are not shown here.
+        *
+        * @param obj not referenced in the lines visible in this patch
+        */
        public void done(Object obj) {
-               if (gTitle != null) 
+               if (gTitle != null && (gTitle.length() > 0))
                        processList();
 
-               GlobalString str = global new GlobalString("true");
-
-               doneList.put(workingURL, str);
 
                while(!toprocess.isEmpty()) {
                        GlobalQuery q = (GlobalQuery)toprocess.pop();
@@ -181,52 +208,85 @@ public class QueryThread extends Thread {
                        gsb.append(path);
 
                        if (!doneList.containsKey(gsb.toGlobalString())) {
-                               todoList.push(q);
+        todoList.push(q);
+
+        // Record the key at queueing time so the containsKey test above
+        // rejects the same key on any later pass.
+        GlobalString str = global new GlobalString("1");
+               doneList.put(gsb.toGlobalString(), str);
                        }
                }
        }
 
+       /**
+        * Extracts the text of the first title element from the response held in lq.
+        *
+        * Both the opening and the closing tag are matched without regard to case.
+        * Whitespace at either end of the text is dropped.
+        *
+        * @param lq query whose response buffer holds the text received so far
+        * @return the trimmed title, or null when there is no opening tag, no
+        *         closing tag, nothing but whitespace between them, or the title
+        *         is one that isErrorPage() recognizes
+        */
        public static String grabTitle(LocalQuery lq) {
-               String sTitle = new String("<title>");  
-               String eTitle = new String("</title>");
+               String openTag = new String("title>");
+               String closeTag = new String("/title>");
        String searchstr = lq.response.toString();
                String title = null;
                char ch;
 
-               int mindex = searchstr.indexOf(sTitle);
-               if (mindex != -1) {
-                       int endquote = searchstr.indexOf(eTitle, mindex+sTitle.length());
-
-                       title = new String(searchstr.subString(mindex+sTitle.length(), endquote));
-                       
-                       if (Character.isWhitespace(title.charAt(0))){
-                               mindex=0;
-                               while (Character.isWhitespace(title.charAt(mindex++)));
-                               mindex--;
-                               title = new String(title.subString(mindex));
-                       }
-
-                       if (Character.isWhitespace(title.charAt(title.length()-1))) {
-                               endquote=title.length()-1;
-                               while (Character.isWhitespace(title.charAt(endquote--)));
-                               endquote += 2;
-                               title = new String(title.subString(0, endquote));
-                       }
-
-                       if (errorPage(title)) 
-                               title = null;
-               }
-
-               return title;
-       }
+               int len = searchstr.length();
+               int mindex = -1;     // index of the first character after the opening tag
+               int endquote = -1;   // index of the '<' that starts the closing tag
+               int i, j;
+
+               // Find the opening tag. The length test keeps subString inside the
+               // response, and the scan stops at the first match.
+               for (i = 0; (i < len) && (mindex == -1); i++) {
+                       if ((searchstr.charAt(i) == '<') && (i+1+openTag.length() <= len)) {
+                               if (searchstr.subString(i+1, i+1+openTag.length()).equalsIgnoreCase(openTag))
+                                       mindex = i+1+openTag.length();
+                       }
+               }
+               if (mindex == -1) return null;
+
+               // Find the closing tag that follows it, with the same bounds protection.
+               for (j = mindex; (j < len) && (endquote == -1); j++) {
+                       if ((searchstr.charAt(j) == '<') && (j+1+closeTag.length() <= len)) {
+                               if (searchstr.subString(j+1, j+1+closeTag.length()).equalsIgnoreCase(closeTag))
+                                       endquote = j;
+                       }
+               }
+               // No closing tag (for example a truncated response): there is no
+               // well-defined title, so do not slice with a negative end index.
+               if (endquote == -1) return null;
+
+               // Trim both ends by moving the two indices toward each other; neither
+               // loop can step outside [mindex, endquote).
+               while ((mindex < endquote) && Character.isWhitespace(searchstr.charAt(mindex)))
+                       mindex++;
+               while ((endquote > mindex) && Character.isWhitespace(searchstr.charAt(endquote-1)))
+                       endquote--;
+               if (mindex >= endquote) return null;   // empty or all-whitespace title
+
+               title = new String(searchstr.subString(mindex, endquote));
+
+               if (isErrorPage(title)) {
+                       return null;
+               }
+
+               return title;
+  }
 
-       public static boolean errorPage(String str) {
+       /**
+        * Tells whether str is one of the fixed titles treated as an error or
+        * redirect page: "301 Moved Permanently", "302 Found", "404 Not Found",
+        * "403 Forbidden" or "404 File Not Found".
+        *
+        * The comparison uses equals(), so str must match one of them exactly.
+        *
+        * @param str title text to classify
+        * @return true when str equals one of the listed titles, false otherwise
+        */
+       public static boolean isErrorPage(String str) {
                if (str.equals("301 Moved Permanently"))     
                        return true;                               
                else if (str.equals("302 Found"))            
                        return true;                               
                else if (str.equals("404 Not Found"))        
                        return true;                               
+               else if (str.equals("403 Forbidden")) 
+                       return true;
+               else if (str.equals("404 File Not Found")) 
+                       return true;
                else                                         
                        return false;                              
        }                                              
@@ -235,7 +295,7 @@ public class QueryThread extends Thread {
     StringBuffer req = new StringBuffer("GET "); 
     req.append("/");
                req.append(path);
-    req.append(" HTTP/1.1\r\nHost:");
+    req.append(" HTTP/1.0\r\nHost:");
     req.append(hostname);
     req.append("\r\n\r\n");
     sock.write(req.toString().getBytes());
@@ -247,57 +307,17 @@ public class QueryThread extends Thread {
        //    state 2 - \r\n
        //    state 3 - \r\n\r
        //    state 4 - \r\n\r\n
-    int state=0;
-    while(true) {
-      if (state<4) {
-        if (state==0) {
-          byte[] b=new byte[1];
-          int numchars=sock.read(b);
-          if ((numchars==1)) {
-            if (b[0]=='\r') {
-              state++;
-            }
-          } else
-                                               return;
-        } else if (state==1) {
-          byte[] b=new byte[1];
-          int numchars=sock.read(b);
-          if (numchars==1) {
-            if (b[0]=='\n')
-              state++;
-            else
-              state=0;
-          } else return;
-        } else if (state==2) {
-          byte[] b=new byte[1];
-          int numchars=sock.read(b);
-          if (numchars==1) {
-            if (b[0]=='\r')
-              state++;
-            else
-              state=0;
-          } else return;
-        } else if (state==3) {
-          byte[] b=new byte[1];
-          int numchars=sock.read(b);
-          if (numchars==1) {
-            if (b[0]=='\n')
-              state++;
-            else
-              state=0;
-          } else return;
-        }
-      } else {
-                               byte[] buffer=new byte[1024];
-        int numchars=sock.read(buffer);
-        if (numchars==0)
-          return;
-        else {
-          String curr=(new String(buffer)).subString(0,numchars);
-                                       lq.response.append(curr);
-        }
-      }
-    }
+    byte[] buffer = new byte[1024];
+    int numchars;
+  
+    do {
+      numchars = sock.read(buffer);
+  
+         String curr = (new String(buffer)).subString(0, numchars);
+                       
+         lq.response.append(curr);
+         buffer = new byte[1024];
+    } while(numchars > 0);
   }
        
        public void processList() {
@@ -328,33 +348,39 @@ public class QueryThread extends Thread {
                        }
                        q.push(workingURL);     
                        results.put(token, q);
-                       System.out.println("Key : ["+token.toLocalString()+"],["+q.size()+"]");
                }
        }
 
+       /**
+        * Decides whether a token should be dropped.
+        *
+        * A token is dropped when it is one of the listed short words or when it
+        * is a single character from the punctuation/blank list below.
+        *
+        * @param str token to test
+        * @return true when the token is to be dropped, false when it is kept
+        */
        public boolean filter(GlobalString str) {
-               if (str.equals("of"))   return true;
-               else if (str.equals("for")) return true;
-               else if (str.equals("a")) return true;
-               else if (str.equals("an")) return true;
-               else if (str.equals("the")) return true;
-               else if (str.equals("at")) return true;
-               else if (str.equals("and")) return true;
-               else if (str.equals("or")) return true;
-               else if (str.equals("but")) return true;
-               else if (str.equals("to")) return true;
-               else if (str.equals(".")) return true;
-               else if (str.equals("=")) return true;
-               else if (str.equals("-")) return true;
-               else if (str.equals(":")) return true;
-               else if (str.equals(";")) return true;
-               else if (str.equals("\'")) return true;
-               else if (str.equals("\"")) return true;
-               else if (str.equals("|")) return true;
-               else if (str.equals("@")) return true;
-               else if (str.equals("&")) return true;
-               else return false;
-       }
+               if (str.equals("of")) return true;
+               else if (str.equals("for")) return true;
+               else if (str.equals("a")) return true;
+               else if (str.equals("an")) return true;
+               else if (str.equals("the")) return true;
+               else if (str.equals("at")) return true;
+               else if (str.equals("and")) return true;
+               else if (str.equals("or")) return true;
+               else if (str.equals("but")) return true;
+               else if (str.equals("to")) return true;
+               else if (str.equals("The")) return true;
+               else if (str.length() == 1) {
+                       // Read the single character once; each test below needs it.
+                       char c = str.charAt(0);
+                       if (c == '.') return true;
+                       else if (c == '-') return true;
+                       else if (c == '=') return true;
+                       else if (c == '_') return true;
+                       else if (c == ':') return true;
+                       else if (c == ';') return true;
+                       else if (c == '\'') return true;
+                       else if (c == '\"') return true;
+                       else if (c == '|') return true;
+                       else if (c == '@') return true;
+                       else if (c == '&') return true;
+                       else if (c == ' ') return true;
+               }
+
+               // Reached by longer tokens and by single characters not listed
+               // above, so the return must not hang off an else.
+               return false;
+  }
 
        public GlobalString refine(GlobalString str) {
                str = refinePrefix(str);
@@ -386,6 +412,11 @@ public class QueryThread extends Thread {
                        if (str.charAt(str.length()-2) == '\'')
                                return str.subString(0, str.length()-2);        
                }
+               else if (str.charAt(str.length()-1) == '-') {
+                       int index = str.length()-2;
+                       while (Character.isWhitespace(str.charAt(index--)));
+                       return str.subString(0, index+2);
+               }
                return str;
        }
   
index 368ec4e935f1c1d44c30ea5d3f9073f69fb95ea1..67be18e04f6efca604450ab5dbf76c76c189431a 100644 (file)
@@ -1,38 +1,35 @@
 public class Spider {
        public static void main(String[] args) {
                int NUM_THREADS = 3;
-    int maxDepth = 3;
+    int maxDepth = 4;
                int i, j;
                QueryThread[] qt;
                GlobalQuery[] currentWorkList;
 
                NUM_THREADS = Integer.parseInt(args[0]);
 
-    if(args.length == 3) {
-      maxDepth = Integer.parseInt(args[2]);
-    }
-
     GlobalString firstmachine;
+    GlobalString firstpage;
 
                int mid[] = new int[NUM_THREADS];
-/*             mid[0] = (128<<24)|(195<<16)|(180<<8)|21;        //dc-4
-               mid[1] = (128<<24)|(195<<16)|(180<<8)|24;        //dc-5
-               mid[2] = (128<<24)|(195<<16)|(180<<8)|26;        //dc-6
-    */
                mid[0] = (128<<24)|(195<<16)|(136<<8)|162;       //dc-1
                mid[1] = (128<<24)|(195<<16)|(136<<8)|163;       //dc-2
                mid[2] = (128<<24)|(195<<16)|(136<<8)|164;       //dc-3
-               mid[3] = (128<<24)|(195<<16)|(136<<8)|165;       //dc-4
-//             mid[4] = (128<<24)|(195<<16)|(136<<8)|166;       //dc-5
-//             mid[5] = (128<<24)|(195<<16)|(136<<8)|167;       //dc-6
 
                atomic {
                        firstmachine = global new GlobalString(args[1]);
 
+      if(args.length == 3) {
+        firstpage = global new GlobalString(args[2]);
+      }
+      else
+        firstpage = global new GlobalString("");
+
+
                        qt = global new QueryThread[NUM_THREADS];
                        currentWorkList = global new GlobalQuery[NUM_THREADS];
                        
-                       GlobalQuery firstquery = global new GlobalQuery(firstmachine);
+                       GlobalQuery firstquery = global new GlobalQuery(firstmachine,firstpage);
 
                        Queue todoList = global new Queue();
       DistributedHashMap doneList = global new DistributedHashMap(500,500, 0.75f);
index 81df760f7df0d8e1a29b6c49fed762c56ca75561..30c3796395922a69a3f3a01b174ef6da5e29a78a 100644 (file)
@@ -3,7 +3,7 @@ SUBCLASS=Query
 SRC1=${MAINCLASS}.java
 SRC2=Global${SUBCLASS}.java
 SRC3=${SUBCLASS}Thread.java
-FLAGS= -dsm -32bit -nooptimize -debug -mainclass ${MAINCLASS}
+FLAGS= -dsm -nooptimize -debug -mainclass ${MAINCLASS}
 default:
        ../../../buildscript ${FLAGS} -o ${MAINCLASS} ${SRC2} ${SRC3} ${SRC1}