updates
authorjihoonl <jihoonl>
Wed, 4 Nov 2009 23:40:47 +0000 (23:40 +0000)
committerjihoonl <jihoonl>
Wed, 4 Nov 2009 23:40:47 +0000 (23:40 +0000)
Robust/src/Benchmarks/Spider/dsm/LocalQuery.java
Robust/src/Benchmarks/Spider/dsm/QueryThread.java
Robust/src/Benchmarks/Spider/dsm/Spider.java
Robust/src/Benchmarks/Spider/dsm/dstm.conf
Robust/src/Benchmarks/Spider/dsm/makefile

index 2315b1e537910453360faa3f4aa26ffec443b829..1beeadbe305c51c05b060dff598b6638ca349b47 100644 (file)
@@ -40,23 +40,37 @@ public class LocalQuery {
 
        public String getHostName(String page) {
                String http = new String("http://");
-               if (page.indexOf(http) == -1) {
+               String https = new String("https://");
+               int beginindex;
+               int endindex;
+
+               if ((page.indexOf(http) == -1) && (page.indexOf(https) == -1)) {
                        return getHostName();
-               } else {
-                       int beginindex = page.indexOf(http) + http.length();
-                       int endindex = page.indexOf('/',beginindex+1);
-                       if ((beginindex == -1)) {
-                               System.printString("ERROR");
-                       }
-                       if (endindex == -1)
-                               endindex=page.length();
-                       return page.subString(beginindex, endindex);
+               } 
+               else if (page.indexOf(https) != -1) {
+                       beginindex = page.indexOf(https) + https.length();
+               }
+               else {
+                       beginindex = page.indexOf(http) + http.length();
                }
+               endindex = page.indexOf('/',beginindex+1);
+
+               if ((beginindex == -1)) {
+                       System.printString("ERROR");
+               }
+               if (endindex == -1)
+                       endindex = page.length();
+
+               return page.subString(beginindex, endindex);
        }
 
        public String getPathName(String page) {
                String http = new String("http://");
-               if (page.indexOf(http) == -1) {
+               String https = new String("https://");
+               int beginindex;
+               int nextindex;
+
+               if ((page.indexOf(http) == -1) && (page.indexOf(https) == -1)) {
                        String path = getPath();
                        int lastindex = path.lastindexOf('/');
                        if (lastindex == -1)
@@ -65,12 +79,17 @@ public class LocalQuery {
                        StringBuffer sb = new StringBuffer(path.subString(0,lastindex+1));
                        sb.append(page);
                        return sb.toString();
-               } else {
-                       int beginindex = page.indexOf(http) + http.length();
-                       int nextindex = page.indexOf('/',beginindex+1);
-                       if ((beginindex==-1) || (nextindex==-1))
-                               return new String("index.html");
-                       return page.subString(nextindex+1, page.length());
                }
+               else if (page.indexOf(https) != -1) {
+                       beginindex = page.indexOf(https) + https.length();
+               }
+               else {
+                       beginindex = page.indexOf(http) + http.length();
+               }
+               nextindex = page.indexOf('/',beginindex+1);
+
+               if ((beginindex==-1) || (nextindex==-1))
+                       return new String("index.html");
+               return page.subString(nextindex+1, page.length());
        }
 }
index 71dc42527975414ba318512278c9f6f9d9b1e4fc..77db3cd74454ecadec227f12851bc4214f1c48e7 100644 (file)
@@ -1,18 +1,22 @@
 public class QueryThread extends Thread {
-       int maxDepth;
-       int maxSearchDepth;
   int MY_MID;
   int NUM_THREADS;
   Queue todoList;
-  Queue doneList;
-  Query myWork;
-  Query[] currentWorkList;
+  DistributedHashMap doneList;
+  GlobalQuery myWork;
+  GlobalQuery[] currentWorkList;
+
+  DistributedHashMap results;
+  Queue toprocess;
+  GlobalString gTitle;
+  GlobalString workingURL;
+       int maxDepth;
 
-  public QueryThread(Queue todoList, Queue doneList, int maxDepth, int maxSearchDepth,int mid,int NUM_THREADS,Query[] currentWorkList) {    
+  public QueryThread(Queue todoList, DistributedHashMap doneList, DistributedHashMap results,int maxDepth,int mid,int NUM_THREADS,GlobalQuery[] currentWorkList) {    
     this.todoList = todoList;
                this.doneList = doneList;
+    this.results = results;
                this.maxDepth = maxDepth;
-               this.maxSearchDepth = maxSearchDepth;
     this.currentWorkList = currentWorkList;
     this.MY_MID = mid;
     this.NUM_THREADS = NUM_THREADS;
@@ -32,7 +36,7 @@ public class QueryThread extends Thread {
 
     while(true) {
       atomic {
-        myWork = (Query)todoList.pop();
+        myWork = (GlobalQuery)todoList.pop();
         
         if(null == myWork)  // no work in todolist
         {
@@ -45,10 +49,10 @@ public class QueryThread extends Thread {
       }
 
       if(chk == 1) { // it has query
-        execute(this);
+        QueryThread.execute(this);
 
         atomic {
-          doneWork(myWork);
+          done(myWork);
           currentWorkList[workMID] = null;
         }
       }
@@ -102,40 +106,37 @@ public class QueryThread extends Thread {
 
   public static void execute(QueryThread qt) {
                int depth;
-    int max;
-    int maxSearch;
-
-    atomic {
-      if(qt.myWork == null) {
-        System.out.println("What!!!!!!!!!!!!!!!");
-        System.exit(0);
-      }
-                       depth = ((Query)qt.myWork).getDepth();
+               int max;
+               
+               atomic {
+                       depth = qt.myWork.getDepth();
       max = qt.maxDepth;
-      maxSearch = qt.maxSearchDepth;
                }
 
                if (depth < max) {
                        /* global variables */
-                       Query q;
-                       GlobalString ghostname;
-                       GlobalString gpath;
+                       GlobalQuery gq;
 
                        /* local variables */
-                       QueryQueue toprocess;
                        LocalQuery lq;
                        String hostname;
                        String path;
+                       String title;
 
                        atomic {
-                               q = (Query)(qt.myWork);
-                               ghostname = q.getHostName();
-                               gpath = q.getPath();
-                               hostname = new String(GlobalString.toLocalCharArray(ghostname));
-                               path = new String(GlobalString.toLocalCharArray(gpath));
+                               gq = qt.myWork;
+                               hostname = new String(GlobalString.toLocalCharArray(gq.getHostName()));
+                               path = new String(GlobalString.toLocalCharArray(gq.getPath()));
+
+                               GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
+                               gsb.append("/");
+                               gsb.append(path);
+                               qt.workingURL = global new GlobalString(gsb.toGlobalString());
+                               qt.gTitle = null;
                        }
                        lq = new LocalQuery(hostname, path, depth);
 
+                       System.printString("["+lq.getDepth()+"] ");
                        System.printString("Processing - Hostname : ");
                        System.printString(hostname);
                        System.printString(", Path : ");
@@ -146,23 +147,91 @@ public class QueryThread extends Thread {
     
                        requestQuery(hostname, path, s);
                        readResponse(lq, s);
-                       toprocess = processPage(lq,maxSearch);
-                       s.close();
-
-                       atomic {
-                               while(!toprocess.isEmpty()) {
-                                       lq = toprocess.pop();
-                                       ghostname = global new GlobalString(lq.getHostName());
-                                       gpath = global new GlobalString(lq.getPath());
 
-                                       q = global new Query(ghostname, gpath, lq.getDepth());
-                                       qt.todoList.push(q);
+                       if ((title = grabTitle(lq)) != null) {
+                               atomic {
+                                       qt.gTitle = global new GlobalString(title);
                                }
                        }
+
+                       atomic {
+                               qt.toprocess = processPage(lq);
+                       }
+
+                       s.close();
                }
   }
-       
-       public static void requestQuery(String hostname, String path, Socket sock) {
+
+       public void done(Object obj) {  // Finishes the current work item: indexes its title (if any), records workingURL in doneList, then queues links not yet in doneList. obj is not used.
+               if (gTitle != null) 
+                       processList();  // a title was stored for this page, so add its words to results
+
+               GlobalString str = global new GlobalString("true");
+
+               doneList.put(workingURL, str);  // doneList is used as a set of URLs; the "true" value is only a placeholder
+
+               while(!toprocess.isEmpty()) {  // NOTE(review): toprocess and workingURL are only assigned in execute() when depth < max -- confirm they cannot be null/stale here
+                       GlobalQuery q = (GlobalQuery)toprocess.pop();
+
+                       GlobalString hostname = global new GlobalString(q.getHostName());
+                       GlobalString path = global new GlobalString(q.getPath());
+
+                       GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);  // build the same "hostname/path" key form that execute() uses for workingURL
+                       gsb.append("/");
+                       gsb.append(path);
+
+                       if (!doneList.containsKey(gsb.toGlobalString())) {  // only queue links whose key has not been recorded in doneList
+                               todoList.push(q);
+                       }
+               }
+       }
+
+       public static String grabTitle(LocalQuery lq) {  // Returns the text between the first "<title>" and the following "</title>" in lq.response, with surrounding whitespace removed; null if there is no "<title>" or errorPage() matches the title.
+               String sTitle = new String("<title>");  
+               String eTitle = new String("</title>");
+       String searchstr = lq.response.toString();
+               String title = null;
+               char ch;  // unused
+
+               int mindex = searchstr.indexOf(sTitle);
+               if (mindex != -1) {
+                       int endquote = searchstr.indexOf(eTitle, mindex+sTitle.length());  // NOTE(review): -1 when "</title>" is missing; not checked before subString below
+
+                       title = new String(searchstr.subString(mindex+sTitle.length(), endquote));
+                       
+                       if (Character.isWhitespace(title.charAt(0))){  // NOTE(review): charAt(0) assumes the title is non-empty -- confirm "<title></title>" cannot occur
+                               mindex=0;
+                               while (Character.isWhitespace(title.charAt(mindex++)));  // skip leading whitespace; NOTE(review): no bound check if the title is all whitespace
+                               mindex--;  // the post-increment overshoots the first non-whitespace index by one
+                               title = new String(title.subString(mindex));
+                       }
+
+                       if (Character.isWhitespace(title.charAt(title.length()-1))) {
+                               endquote=title.length()-1;
+                               while (Character.isWhitespace(title.charAt(endquote--)));  // walk back over trailing whitespace
+                               endquote += 2;  // loop leaves endquote one below the last non-whitespace index; +2 gives the end index (assuming subString's end is exclusive)
+                               title = new String(title.subString(0, endquote));
+                       }
+
+                       if (errorPage(title)) 
+                               title = null;  // titles of HTTP error/redirect pages are treated as "no title"
+               }
+
+               return title;
+       }
+
+       public static boolean errorPage(String str) {  // True when str is exactly one of the three HTTP status titles compared below (301, 302, 404); false otherwise.
+               if (str.equals("301 Moved Permanently"))     
+                       return true;                               
+               else if (str.equals("302 Found"))            
+                       return true;                               
+               else if (str.equals("404 Not Found"))        
+                       return true;                               
+               else                                         
+                       return false;                              
+       }                                              
+  
+  public static void requestQuery(String hostname, String path, Socket sock) {
     StringBuffer req = new StringBuffer("GET "); 
     req.append("/");
                req.append(path);
@@ -231,22 +300,107 @@ public class QueryThread extends Thread {
     }
   }
        
-       public void doneWork(Object obj) {
-               doneList.push(obj);
+       public void processList() {  // Splits gTitle on single spaces and, for each kept token, appends workingURL to that token's Queue in results.
+               LinkedList ll;  // unused
+               GlobalString token = null;
+               int mindex = 0;  // start index of the current token within gTitle
+               int endquote = 0;  // index of the next space, or -1 once the last token is reached
+
+               while (endquote != -1) {
+                       endquote = gTitle.indexOf(' ', mindex);
+
+                       if (endquote != -1) {
+                               token = gTitle.subString(mindex, endquote);  // NOTE(review): two adjacent spaces give an empty token, which refinePrefix() then indexes with charAt(0) -- confirm
+                               mindex = endquote + 1;
+                               if (filter(token)) {
+                                       continue;  // token is on the skip list; mindex was already advanced
+                               }
+                               token = refine(token);
+                       }
+                       else {
+                               token = gTitle.subString(mindex);  // last token: it is refined but never passed through filter()
+                               token = refine(token);
+                       }
+
+                       Queue q = (Queue)results.get(token);
+                       if (q == null) {
+                               q = global new Queue();  // first URL seen for this token
+                       }
+                       q.push(workingURL);     
+                       results.put(token, q);
+                       System.out.println("Key : ["+token.toLocalString()+"],["+q.size()+"]");
+               }
+       }
+
+       public boolean filter(GlobalString str) {  // True when str equals one of the short words or single punctuation marks listed below; processList() skips such tokens.
+               if (str.equals("of"))   return true;
+               else if (str.equals("for")) return true;
+               else if (str.equals("a")) return true;
+               else if (str.equals("an")) return true;
+               else if (str.equals("the")) return true;
+               else if (str.equals("at")) return true;
+               else if (str.equals("and")) return true;
+               else if (str.equals("or")) return true;
+               else if (str.equals("but")) return true;
+               else if (str.equals("to")) return true;
+               else if (str.equals(".")) return true;
+               else if (str.equals("=")) return true;
+               else if (str.equals("-")) return true;
+               else if (str.equals(":")) return true;
+               else if (str.equals(";")) return true;
+               else if (str.equals("\'")) return true;
+               else if (str.equals("\"")) return true;
+               else if (str.equals("|")) return true;
+               else if (str.equals("@")) return true;
+               else if (str.equals("&")) return true;
+               else return false;
        }
 
-  public static QueryQueue processPage(LocalQuery lq,int maxSearchDepth) {
+       public GlobalString refine(GlobalString str) {  // Returns str after refinePrefix() and then refinePostfix() have been applied.
+               str = refinePrefix(str);
+               str = refinePostfix(str);
+               return str;
+       }
+
+       public GlobalString refinePrefix(GlobalString str) {  // Returns str without its first character when that character is '&'; otherwise returns str unchanged.
+               if (str.charAt(0) == '&') {             // leading '&'; NOTE(review): charAt(0) assumes str is non-empty -- confirm
+                       return str.subString(1);
+               }
+               return str;
+       }
+
+       public GlobalString refinePostfix(GlobalString str) {  // Returns str without one trailing ',', ':', ';' or '!', or without a trailing "'s"; otherwise returns str unchanged.
+               if (str.charAt(str.length()-1) == ',') {                        // trailing ','; NOTE(review): length()-1 assumes str is non-empty -- confirm
+                       return str.subString(0, str.length()-1);
+               }
+               else if (str.charAt(str.length()-1) == ':') {           // trailing ':'
+                       return str.subString(0, str.length()-1);
+               }
+               else if (str.charAt(str.length()-1) == ';') {           // trailing ';'
+                       return str.subString(0, str.length()-1);
+               }
+               else if (str.charAt(str.length()-1) == '!') {           // trailing '!'
+                       return str.subString(0, str.length()-1);
+               }
+               else if (str.charAt(str.length()-1) == 's') {                   // possible trailing "'s"
+                       if (str.charAt(str.length()-2) == '\'')  // NOTE(review): index is -1 when str is just "s" -- confirm charAt tolerates that
+                               return str.subString(0, str.length()-2);        
+               }
+               return str;
+       }
+  
+  public static Queue processPage(LocalQuery lq) {
     int index = 0;
        String href = new String("href=\"");
        String searchstr = lq.response.toString();
                int depth;
        boolean cont = true;
+               Queue toprocess;
 
-               QueryQueue toprocess = new QueryQueue();
                depth = lq.getDepth() + 1;
 
-               int searchDepthCnt = 0;
-               while(cont && (searchDepthCnt < maxSearchDepth)) {
+               toprocess = global new Queue();
+               while(cont) {
                        int mindex = searchstr.indexOf(href,index);
                        if (mindex != -1) {     
                                int endquote = searchstr.indexOf('"', mindex+href.length());
@@ -254,17 +408,20 @@ public class QueryThread extends Thread {
                      String match = searchstr.subString(mindex+href.length(), endquote);
                                        String match2 = lq.makewebcanonical(match);
        
-                     if (match2 != null) {
-                                               LocalQuery newlq = new LocalQuery(lq.getHostName(match), lq.getPathName(match), depth);
+                                       GlobalString ghostname;
+                                       GlobalString gpath;
+
+                                       ghostname = global new GlobalString(lq.getHostName(match));
+                                       gpath = global new GlobalString(lq.getPathName(match));
 
-                                               toprocess.push(newlq);
-                                               searchDepthCnt++;
+                     if (match2 != null) {
+                                                       GlobalQuery gq = global new GlobalQuery(ghostname, gpath, depth);
+                                                       toprocess.push(gq);
                                        }
                                        index = endquote;
         } else cont = false;
       } else cont = false;
-    }
-
+    }                                                                          
                return toprocess;
   }
 }
index 1ba9f82b05b93ce9fa6ed76a04c8d7dbc569f94a..368ec4e935f1c1d44c30ea5d3f9073f69fb95ea1 100644 (file)
@@ -1,17 +1,19 @@
 public class Spider {
        public static void main(String[] args) {
-               int NUM_THREADS = 4;
-               int maxDepth = 5;
-               int searchDepth = 10;
+               int NUM_THREADS = 3;
+    int maxDepth = 3;
                int i, j;
                QueryThread[] qt;
-               Query[] currentWorkList;
+               GlobalQuery[] currentWorkList;
 
                NUM_THREADS = Integer.parseInt(args[0]);
-               GlobalString firstmachine;
-               GlobalString firstpage;
 
-//             int[] mid = getMID(NUM_THREADS);
+    if(args.length == 3) {
+      maxDepth = Integer.parseInt(args[2]);
+    }
+
+    GlobalString firstmachine;
+
                int mid[] = new int[NUM_THREADS];
 /*             mid[0] = (128<<24)|(195<<16)|(180<<8)|21;        //dc-4
                mid[1] = (128<<24)|(195<<16)|(180<<8)|24;        //dc-5
@@ -26,19 +28,20 @@ public class Spider {
 
                atomic {
                        firstmachine = global new GlobalString(args[1]);
-                       firstpage = global new GlobalString(args[2]);
 
                        qt = global new QueryThread[NUM_THREADS];
-                       currentWorkList = global new Query[NUM_THREADS];
+                       currentWorkList = global new GlobalQuery[NUM_THREADS];
                        
-                       Query firstquery = global new Query(firstmachine, firstpage, 0);
+                       GlobalQuery firstquery = global new GlobalQuery(firstmachine);
 
                        Queue todoList = global new Queue();
-                       Queue doneList = global new Queue();
+      DistributedHashMap doneList = global new DistributedHashMap(500,500, 0.75f);
+      DistributedHashMap results = global new DistributedHashMap(100,100,0.75f);
+
                        todoList.push(firstquery);
 
                        for (i = 0; i < NUM_THREADS; i++) {
-                               qt[i] = global new QueryThread(todoList, doneList, maxDepth, searchDepth,i,NUM_THREADS,currentWorkList);
+                               qt[i] = global new QueryThread(todoList, doneList, results,maxDepth, i,NUM_THREADS,currentWorkList);
                        }
                }
                System.printString("Finished to create Objects\n");
index 4a1050b0689adb4278147ea8562bbb977d9261f4..eff447b84ee46f8283d2f55e0e45f4c543ef1333 100644 (file)
@@ -4,6 +4,3 @@
 128.195.136.162
 128.195.136.163
 128.195.136.164
-128.195.136.165
-#128.195.136.166
-#128.195.136.167
index 740b98ebb970e1c1e88a0704c695f2c72e6daafc..81df760f7df0d8e1a29b6c49fed762c56ca75561 100644 (file)
@@ -1,18 +1,12 @@
 MAINCLASS=Spider
 SUBCLASS=Query
 SRC1=${MAINCLASS}.java
-SRC2=${SUBCLASS}.java
-SRC3=${SUBCLASS}Queue.java
-SRC4=${SUBCLASS}Thread.java
+SRC2=Global${SUBCLASS}.java
+SRC3=${SUBCLASS}Thread.java
 FLAGS= -dsm -32bit -nooptimize -debug -mainclass ${MAINCLASS}
 default:
-       ../../../buildscript ${FLAGS} -o ${MAINCLASS} ${SRC2} ${SRC3} ${SRC4} ${SRC1}
+       ../../../buildscript ${FLAGS} -o ${MAINCLASS} ${SRC2} ${SRC3} ${SRC1}
 
 clean:
        rm -rf tmpbuilddirectory
        rm *.bin
-       rm *.php
-       rm *.css
-       rm www*
-       rm eee*
-       rm web*