recovery web crawler
author jihoonl <jihoonl>
Wed, 7 Oct 2009 19:24:38 +0000 (19:24 +0000)
committer jihoonl <jihoonl>
Wed, 7 Oct 2009 19:24:38 +0000 (19:24 +0000)
Robust/src/Benchmarks/Spider/dsm/Query.java
Robust/src/Benchmarks/Spider/dsm/QueryList.java
Robust/src/Benchmarks/Spider/dsm/QueryThread.java
Robust/src/Benchmarks/Spider/dsm/Spider.java
Robust/src/Benchmarks/Spider/dsm/dstm.conf
Robust/src/Benchmarks/Spider/dsm/makefile

index 6cf78ad0e75a7f6bfcaa5a8ad47e4a96e9ebb69d..7812fff777b236f60e6661b285c0f88bd03e9882 100644 (file)
@@ -1,14 +1,18 @@
 public class Query {
   GlobalString hostname;
   GlobalString path;
-  GlobalStringBuffer response;
+       int depth;
   
-  public Query(GlobalString hostname, GlobalString path) {
+  public Query(GlobalString hostname, GlobalString path, int depth) {
     this.hostname = global new GlobalString(hostname);
     this.path = global new GlobalString(path);
-    response = global new GlobalStringBuffer();
+               this.depth = depth;
   }
 
+       public int getDepth() {
+               return depth;
+       }
+       
   public GlobalString getHostName() {
     return hostname;
   }
@@ -16,22 +20,6 @@ public class Query {
   public GlobalString getPath() {
     return path;
   }
-   
-  public void outputFile() {
-               StringBuffer sb = new StringBuffer(hostname.toLocalString());
-               sb.append(path.toLocalString());
-    FileOutputStream fos = new FileOutputStream(sb.toString().replace('/','#'));
-    fos.write(response.toLocalString().getBytes());
-    fos.close();
-  }
-       
-
-  public GlobalString makewebcanonical(GlobalString page) {
-    GlobalStringBuffer b = global new GlobalStringBuffer(getHostName(page));
-    b.append("/");
-               b.append(getPathName(page));
-    return b.toGlobalString();
-  }
 
   public GlobalString getHostName(GlobalString page) {
     GlobalString http = global new GlobalString("http://");
index fa4a9fff662653875101c454fb4fcb5c347db7e6..d09167b079e1272c91d166a18a3c36a1735a1697 100644 (file)
@@ -1,6 +1,8 @@
 public class QueryList extends Queue {
+       Queue queries;
+
   public QueryList() {
-               Queue();                        // ??
+               queries = global new Queue();
   }
 
   public boolean checkQuery(GlobalString x) {
@@ -13,4 +15,8 @@ public class QueryList extends Queue {
                }
                return set;
   }
+
+       public void addQuery(GlobalString x) {
+               queries.push(x);
+       }
 }
index d9dc3690260a0298504c65b9a16e51e67ccf4899..7d6e3530ed75ea56fa51c29ebbf96559e2f95f6b 100644 (file)
@@ -1,67 +1,83 @@
 public class QueryThread extends Task {
        int maxDepth;
-       int depthCnt;
        int maxSearchDepth;
-       int searchDepthCnt;
 
-  public QueryThread(Queue qq, Queue ql, int depth, int searchDepth) {
-    this.todoList = qq;
-               this.doneList = ql;
-               this.maxDepth = depth;
-               this.maxSearchDepth = searchDepth;
-               depthCnt = 1;
-               searchDepthCnt = 0;
+  public QueryThread(Queue todoList, Queue doneList, int maxDepth, int maxSearchDepth) {
+    this.todoList = todoList;
+               this.doneList = doneList;
+               this.maxDepth = maxDepth;
+               this.maxSearchDepth = maxSearchDepth;
   }
 
-  public void execute(Object mywork) {
-               Query q = (Query)mywork;
-               GlobalString ghostname;
-               GlobalString gpath;
-
+  public void execute() {
+               int depth;
+    int max;
+    int maxSearch;
+               
                atomic {
-                       ghostname = q.getHostName();
-                       gpath = q.getPath();
+                       depth = ((Query)myWork).getDepth();
+      max = this.maxDepth;
+      maxSearch = this.maxSearchDepth;
                }
 
-               String hostname = new String(GlobalString.toLocalCharArray(ghostname));
-               String path = new String(GlobalString.toLocalCharArray(gpath));
+               if (depth < max) {
+                       /* global variables */
+                       Query q;
+                       GlobalString ghostname;
+                       GlobalString gpath;
 
-               System.printString("Processing ");
-               System.printString(hostname + "\n");
-               System.printString(" ");
-               System.printString(path);
-               System.printString("\n");
+                       /* local variables */
+                       QueryQueue toprocess;
+                       LocalQuery lq;
+                       String hostname;
+                       String path;
 
-               Socket s = new Socket(hostname, 80);
+                       atomic {
+                               q = (Query)myWork;
+                               ghostname = q.getHostName();
+                               gpath = q.getPath();
+                               hostname = new String(GlobalString.toLocalCharArray(ghostname));
+                               path = new String(GlobalString.toLocalCharArray(gpath));
+                       }
+                       lq = new LocalQuery(hostname, path, depth);
 
-               requestQuery(hostname, path, s);
-//             System.printString("Wait for 5 secs\n");
-//             Thread.sleep(2000000);
+                       System.printString("Processing - Hostname : ");
+                       System.printString(hostname);
+                       System.printString(", Path : ");
+                       System.printString(path);
+                       System.printString("\n");
 
-               readResponse(q, s);
-//             System.printString("Wait for 5 secs\n");
-//             Thread.sleep(2000000);
+                       Socket s = new Socket(hostname, 80);
+    
+                       requestQuery(hostname, path, s);
+                       readResponse(lq, s);
+                       toprocess = processPage(lq,maxSearch);
+                       s.close();
 
-               q.outputFile();
-//             System.printString("Wait for 5 secs\n");
-//             Thread.sleep(2000000);
+                       atomic {
+                               while(!toprocess.isEmpty()) {
+                                       lq = toprocess.pop();
+                                       ghostname = global new GlobalString(lq.getHostName());
+                                       gpath = global new GlobalString(lq.getPath());
 
-               processPage(q, (QueryList)doneList);
-               s.close();
+                                       q = global new Query(ghostname, gpath, lq.getDepth());
+                                       todoList.push(q);
+                               }
+                       }
+               }
   }
        
-       public void requestQuery(String hostname, String path, Socket sock) {
+       public static void requestQuery(String hostname, String path, Socket sock) {
     StringBuffer req = new StringBuffer("GET "); 
     req.append("/");
                req.append(path);
     req.append(" HTTP/1.1\r\nHost:");
     req.append(hostname);
     req.append("\r\n\r\n");
-               System.printString("req : " + req + "\n");
     sock.write(req.toString().getBytes());
   }
 
-       public void readResponse(Query q, Socket sock) {
+       public static void readResponse(LocalQuery lq, Socket sock) {
        //    state 0 - nothing
        //    state 1 - \r
        //    state 2 - \r\n
@@ -114,7 +130,7 @@ public class QueryThread extends Task {
           return;
         else {
           String curr=(new String(buffer)).subString(0,numchars);
-                                       q.response.append(curr);
+                                       lq.response.append(curr);
         }
       }
     }
@@ -122,48 +138,38 @@ public class QueryThread extends Task {
        
        public void done(Object obj) {
                doneList.push(obj);
-//             System.printString("Size of todoList : " + todoList.size() + "\n");
-//             Thread.sleep(5000000);
        }
 
-  public void processPage(Query q, QueryList doneList) {
+  public static QueryQueue processPage(LocalQuery lq,int maxSearchDepth) {
     int index = 0;
        String href = new String("href=\"");
-       String searchstr = q.response.toLocalString();
+       String searchstr = lq.response.toString();
+               int depth;
        boolean cont = true;
 
+               QueryQueue toprocess = new QueryQueue();
+               depth = lq.getDepth() + 1;
+
+               int searchDepthCnt = 0;
                while(cont && (searchDepthCnt < maxSearchDepth)) {
                        int mindex = searchstr.indexOf(href,index);
                        if (mindex != -1) {     
                                int endquote = searchstr.indexOf('"', mindex+href.length());
                if (endquote != -1) {
                      String match = searchstr.subString(mindex+href.length(), endquote);
-                                       GlobalString gmatch;
-                                       GlobalString gmatch2;
+                                       String match2 = lq.makewebcanonical(match);
+       
+                     if (match2 != null) {
+                                               LocalQuery newlq = new LocalQuery(lq.getHostName(match), lq.getPathName(match), depth);
 
-                                       atomic {
-                                               gmatch = global new GlobalString(match);
-                                               gmatch2 = q.makewebcanonical(gmatch);
-                                       }
-                     if (gmatch2 != null && !doneList.checkQuery(gmatch2)) {
-//                                             doneList.push(gmatch2);
-                                               done(gmatch2);
-                                               if (depthCnt < maxDepth) {
-                                                       Query newq;
-                                                       System.printString("Depth : " + depthCnt + "\n");
-                                                       atomic {
-                                                               newq = global new Query(q.getHostName(gmatch), q.getPathName(gmatch));
-                                                               todoList.push(newq);
-                                                               System.printString("Size of todoList : " + todoList.size() + "\n");
-                                                               searchDepthCnt++;
-                                                       }
-                                               }
+                                               toprocess.push(newlq);
+                                               searchDepthCnt++;
                                        }
-                     index = endquote;
+                                       index = endquote;
         } else cont = false;
       } else cont = false;
     }
-               depthCnt++;
-               searchDepthCnt = 0;
+
+               return toprocess;
   }
 }
index 66504dc7aed17b25731c16e84e584c0220d07730..6163e1e06014b75e2f92a99e6d927050d6ef39a8 100644 (file)
@@ -1,39 +1,8 @@
 public class Spider {
-       public static int[] getMID (int num_threads) {
-               int[] mid = new int[num_threads];
-
-               FileInputStream ifs = new FileInputStream("dstm.conf");
-               String str;
-               String sub;
-               int fromIndex = 0;
-               int endIndex = 0;
-               int[] tmp = new int[4];
-
-               for (int i = 0; i < num_threads; i++) { 
-                       str = ifs.readLine();
-                       endIndex = str.indexOf('.', fromIndex);
-                       sub = str.subString(fromIndex, endIndex);
-
-                       fromIndex = endIndex + 1;
-                       endIndex = str.indexOf('.', fromIndex);
-                       sub = str.subString(fromIndex, endIndex);
-
-                       fromIndex = endIndex + 1;
-                       endIndex = str.indexOf('.', fromIndex);
-                       sub = str.subString(fromIndex, endIndex);
-
-                       fromIndex = endIndex + 1;
-                       sub = str.subString(fromIndex);
-
-                       fromIndex = 0;
-               }
-               return mid;
-       }
-
        public static void main(String[] args) {
-               int NUM_THREADS = 3;
-               int depth = 5;
-               int searchDepth = 5;
+               int NUM_THREADS = 4;
+               int maxDepth = 5;
+               int searchDepth = 10;
                int i, j;
                Work[] works;
                QueryThread[] qt;
@@ -43,13 +12,17 @@ public class Spider {
                GlobalString firstmachine;
                GlobalString firstpage;
 
-               int[] mid = getMID(NUM_THREADS);
+//             int[] mid = getMID(NUM_THREADS);
+               int mid[] = new int[NUM_THREADS];
+/*             mid[0] = (128<<24)|(195<<16)|(180<<8)|21;        //dc-4
+               mid[1] = (128<<24)|(195<<16)|(180<<8)|24;        //dc-5
+               mid[2] = (128<<24)|(195<<16)|(180<<8)|26;        //dc-6
+    */
+               mid[0] = (128<<24)|(195<<16)|(136<<8)|162;       //dc-1
+               mid[1] = (128<<24)|(195<<16)|(136<<8)|163;       //dc-2
+               mid[2] = (128<<24)|(195<<16)|(136<<8)|164;       //dc-3
+               mid[3] = (128<<24)|(195<<16)|(136<<8)|165;       //dc-3
 
-/*             int mid[] = new int[NUM_THREADS];
-               mid[0] = (128<<24)|(195<<16)|(136<<8)|166;       //dc-4
-               mid[1] = (128<<24)|(195<<16)|(136<<8)|167;       //dc-5
-               mid[2] = (128<<24)|(195<<16)|(136<<8)|168;       //dc-6
-*/
                atomic {
                        firstmachine = global new GlobalString(args[1]);
                        firstpage = global new GlobalString(args[2]);
@@ -58,14 +31,14 @@ public class Spider {
                        qt = global new QueryThread[NUM_THREADS];
                        currentWorkList = global new Query[NUM_THREADS];
                        
-                       Query firstquery = global new Query(firstmachine, firstpage);
+                       Query firstquery = global new Query(firstmachine, firstpage, 0);
 
                        Queue todoList = global new Queue();
+                       Queue doneList = global new Queue();
                        todoList.push(firstquery);
-                       QueryList doneList = global new QueryList();
 
                        for (i = 0; i < NUM_THREADS; i++) {
-                               qt[i] = global new QueryThread(todoList, doneList, depth, searchDepth);
+                               qt[i] = global new QueryThread(todoList, doneList, maxDepth, searchDepth);
                                works[i] = global new Work(qt[i], NUM_THREADS, i, currentWorkList);
                        }
                }
@@ -76,7 +49,7 @@ public class Spider {
                        atomic {
                                tmp = works[i];
                        }
-                       tmp.start(mid[i]);
+                       Thread.myStart(tmp, mid[i]);
                }
 
                for (i = 0; i < NUM_THREADS; i++) {
@@ -85,9 +58,45 @@ public class Spider {
                        }
                        tmp.join();
                }
+       }
+
+       public static int[] getMID (int num_threads) {
+               int[] mid = new int[num_threads];
+
+               FileInputStream ifs = new FileInputStream("dstm.conf");
+               String str;
+               String sub;
+               int fromIndex;
+               int endIndex;
+               double num;
+
+               for (int i = 0; i < num_threads; i++) { 
+                       int power = 3 - i;
+                       fromIndex = 0;
+                       num = 0;
+
+                       str = ifs.readLine();
+
+                       endIndex = str.indexOf('.', fromIndex);
+                       sub = str.subString(fromIndex, endIndex);
+                       num += (Integer.parseInt(sub) << 24);
+
+                       fromIndex = endIndex + 1;
+                       endIndex = str.indexOf('.', fromIndex);
+                       sub = str.subString(fromIndex, endIndex);
+                       num += (Integer.parseInt(sub) << 16);
+
+                       fromIndex = endIndex + 1;
+                       endIndex = str.indexOf('.', fromIndex);
+                       sub = str.subString(fromIndex, endIndex);
+                       num += (Integer.parseInt(sub) << 8);
 
-//             while(true)
-//                     Thread.sleep(1000000);
+                       fromIndex = endIndex + 1;
+                       sub = str.subString(fromIndex);
+                       num += Integer.parseInt(sub);
 
+                       mid[i] = (int)num;
+               }
+               return mid;
        }
 }
index 6b3f3e964939055145e78205b5373e4cc5a1884e..4a1050b0689adb4278147ea8562bbb977d9261f4 100644 (file)
@@ -1,3 +1,9 @@
-128.195.136.166
-128.195.136.167
-128.195.136.168
+#128.195.180.21
+#128.195.180.24
+#128.195.180.26
+128.195.136.162
+128.195.136.163
+128.195.136.164
+128.195.136.165
+#128.195.136.166
+#128.195.136.167
index 0e561faffe39c53ff20657bc149e4f11ab08a26e..35b0e708abc82aa00cc159db0d90c7e598476372 100644 (file)
@@ -2,7 +2,7 @@ MAINCLASS=Spider
 SUBCLASS=Query
 SRC1=${MAINCLASS}.java
 SRC2=${SUBCLASS}.java
-SRC3=${SUBCLASS}List.java
+SRC3=${SUBCLASS}Queue.java
 SRC4=${SUBCLASS}Thread.java
 FLAGS= -dsm -32bit -nooptimize -debug -recovery -mainclass ${MAINCLASS}
 default:
@@ -11,3 +11,8 @@ default:
 clean:
        rm -rf tmpbuilddirectory
        rm *.bin
+       rm *.php
+       rm *.css
+       rm www*
+       rm eee*
+       rm web*