From 2a715358e108f1d2fe8b983707ce11358c625a46 Mon Sep 17 00:00:00 2001 From: jihoonl Date: Wed, 7 Oct 2009 19:24:38 +0000 Subject: [PATCH] recovery web crawler --- Robust/src/Benchmarks/Spider/dsm/Query.java | 26 +--- .../src/Benchmarks/Spider/dsm/QueryList.java | 8 +- .../Benchmarks/Spider/dsm/QueryThread.java | 136 +++++++++--------- Robust/src/Benchmarks/Spider/dsm/Spider.java | 101 +++++++------ Robust/src/Benchmarks/Spider/dsm/dstm.conf | 12 +- Robust/src/Benchmarks/Spider/dsm/makefile | 7 +- 6 files changed, 155 insertions(+), 135 deletions(-) diff --git a/Robust/src/Benchmarks/Spider/dsm/Query.java b/Robust/src/Benchmarks/Spider/dsm/Query.java index 6cf78ad0..7812fff7 100644 --- a/Robust/src/Benchmarks/Spider/dsm/Query.java +++ b/Robust/src/Benchmarks/Spider/dsm/Query.java @@ -1,14 +1,18 @@ public class Query { GlobalString hostname; GlobalString path; - GlobalStringBuffer response; + int depth; - public Query(GlobalString hostname, GlobalString path) { + public Query(GlobalString hostname, GlobalString path, int depth) { this.hostname = global new GlobalString(hostname); this.path = global new GlobalString(path); - response = global new GlobalStringBuffer(); + this.depth = depth; } + public int getDepth() { + return depth; + } + public GlobalString getHostName() { return hostname; } @@ -16,22 +20,6 @@ public class Query { public GlobalString getPath() { return path; } - - public void outputFile() { - StringBuffer sb = new StringBuffer(hostname.toLocalString()); - sb.append(path.toLocalString()); - FileOutputStream fos = new FileOutputStream(sb.toString().replace('/','#')); - fos.write(response.toLocalString().getBytes()); - fos.close(); - } - - - public GlobalString makewebcanonical(GlobalString page) { - GlobalStringBuffer b = global new GlobalStringBuffer(getHostName(page)); - b.append("/"); - b.append(getPathName(page)); - return b.toGlobalString(); - } public GlobalString getHostName(GlobalString page) { GlobalString http = global new GlobalString("http://"); diff --git a/Robust/src/Benchmarks/Spider/dsm/QueryList.java b/Robust/src/Benchmarks/Spider/dsm/QueryList.java index fa4a9fff..d09167b0 100644 --- a/Robust/src/Benchmarks/Spider/dsm/QueryList.java +++ b/Robust/src/Benchmarks/Spider/dsm/QueryList.java @@ -1,6 +1,8 @@ public class QueryList extends Queue { + Queue queries; + public QueryList() { - Queue(); // ?? + queries = global new Queue(); } public boolean checkQuery(GlobalString x) { @@ -13,4 +15,8 @@ public class QueryList extends Queue { } return set; } + + public void addQuery(GlobalString x) { + queries.push(x); + } } diff --git a/Robust/src/Benchmarks/Spider/dsm/QueryThread.java b/Robust/src/Benchmarks/Spider/dsm/QueryThread.java index d9dc3690..7d6e3530 100644 --- a/Robust/src/Benchmarks/Spider/dsm/QueryThread.java +++ b/Robust/src/Benchmarks/Spider/dsm/QueryThread.java @@ -1,67 +1,83 @@ public class QueryThread extends Task { int maxDepth; - int depthCnt; int maxSearchDepth; - int searchDepthCnt; - public QueryThread(Queue qq, Queue ql, int depth, int searchDepth) { - this.todoList = qq; - this.doneList = ql; - this.maxDepth = depth; - this.maxSearchDepth = searchDepth; - depthCnt = 1; - searchDepthCnt = 0; + public QueryThread(Queue todoList, Queue doneList, int maxDepth, int maxSearchDepth) { + this.todoList = todoList; + this.doneList = doneList; + this.maxDepth = maxDepth; + this.maxSearchDepth = maxSearchDepth; } - public void execute(Object mywork) { - Query q = (Query)mywork; - GlobalString ghostname; - GlobalString gpath; - + public void execute() { + int depth; + int max; + int maxSearch; + atomic { - ghostname = q.getHostName(); - gpath = q.getPath(); + depth = ((Query)myWork).getDepth(); + max = this.maxDepth; + maxSearch = this.maxSearchDepth; } - String hostname = new String(GlobalString.toLocalCharArray(ghostname)); - String path = new String(GlobalString.toLocalCharArray(gpath)); + if (depth < max) { + /* global variables */ + Query q; + GlobalString ghostname; + GlobalString gpath; - System.printString("Processing "); - System.printString(hostname + "\n"); - System.printString(" "); - System.printString(path); - System.printString("\n"); + /* local variables */ + QueryQueue toprocess; + LocalQuery lq; + String hostname; + String path; - Socket s = new Socket(hostname, 80); + atomic { + q = (Query)myWork; + ghostname = q.getHostName(); + gpath = q.getPath(); + hostname = new String(GlobalString.toLocalCharArray(ghostname)); + path = new String(GlobalString.toLocalCharArray(gpath)); + } + lq = new LocalQuery(hostname, path, depth); - requestQuery(hostname, path, s); -// System.printString("Wait for 5 secs\n"); -// Thread.sleep(2000000); + System.printString("Processing - Hostname : "); + System.printString(hostname); + System.printString(", Path : "); + System.printString(path); + System.printString("\n"); - readResponse(q, s); -// System.printString("Wait for 5 secs\n"); -// Thread.sleep(2000000); + Socket s = new Socket(hostname, 80); + + requestQuery(hostname, path, s); + readResponse(lq, s); + toprocess = processPage(lq,maxSearch); + s.close(); - q.outputFile(); -// System.printString("Wait for 5 secs\n"); -// Thread.sleep(2000000); + atomic { + while(!toprocess.isEmpty()) { + lq = toprocess.pop(); + ghostname = global new GlobalString(lq.getHostName()); + gpath = global new GlobalString(lq.getPath()); - processPage(q, (QueryList)doneList); - s.close(); + q = global new Query(ghostname, gpath, lq.getDepth()); + todoList.push(q); + } + } + } } - public void requestQuery(String hostname, String path, Socket sock) { + public static void requestQuery(String hostname, String path, Socket sock) { StringBuffer req = new StringBuffer("GET "); req.append("/"); req.append(path); req.append(" HTTP/1.1\r\nHost:"); req.append(hostname); req.append("\r\n\r\n"); - System.printString("req : " + req + "\n"); sock.write(req.toString().getBytes()); } - public void readResponse(Query q, Socket sock) { + public static void readResponse(LocalQuery lq, Socket sock) { // state 0 - nothing // state 1 - \r // state 2 - \r\n @@ -114,7 +130,7 @@ public class QueryThread extends Task { return; else { String curr=(new String(buffer)).subString(0,numchars); - q.response.append(curr); + lq.response.append(curr); } } } @@ -122,48 +138,38 @@ public class QueryThread extends Task { public void done(Object obj) { doneList.push(obj); -// System.printString("Size of todoList : " + todoList.size() + "\n"); -// Thread.sleep(5000000); } - public void processPage(Query q, QueryList doneList) { + public static QueryQueue processPage(LocalQuery lq,int maxSearchDepth) { int index = 0; String href = new String("href=\""); - String searchstr = q.response.toLocalString(); + String searchstr = lq.response.toString(); + int depth; boolean cont = true; + QueryQueue toprocess = new QueryQueue(); + depth = lq.getDepth() + 1; + + int searchDepthCnt = 0; while(cont && (searchDepthCnt < maxSearchDepth)) { int mindex = searchstr.indexOf(href,index); if (mindex != -1) { int endquote = searchstr.indexOf('"', mindex+href.length()); if (endquote != -1) { String match = searchstr.subString(mindex+href.length(), endquote); - GlobalString gmatch; - GlobalString gmatch2; + String match2 = lq.makewebcanonical(match); + + if (match2 != null) { + LocalQuery newlq = new LocalQuery(lq.getHostName(match), lq.getPathName(match), depth); - atomic { - gmatch = global new GlobalString(match); - gmatch2 = q.makewebcanonical(gmatch); - } - if (gmatch2 != null && !doneList.checkQuery(gmatch2)) { -// doneList.push(gmatch2); - done(gmatch2); - if (depthCnt < maxDepth) { - Query newq; - System.printString("Depth : " + depthCnt + "\n"); - atomic { - newq = global new Query(q.getHostName(gmatch), q.getPathName(gmatch)); - todoList.push(newq); - System.printString("Size of todoList : " + todoList.size() + "\n"); - searchDepthCnt++; - } - } + toprocess.push(newlq); + searchDepthCnt++; } - index = endquote; + index = endquote; } else cont = false; } else cont = false; } - depthCnt++; - searchDepthCnt = 0; + + return toprocess; } } diff --git a/Robust/src/Benchmarks/Spider/dsm/Spider.java b/Robust/src/Benchmarks/Spider/dsm/Spider.java index 66504dc7..6163e1e0 100644 --- a/Robust/src/Benchmarks/Spider/dsm/Spider.java +++ b/Robust/src/Benchmarks/Spider/dsm/Spider.java @@ -1,39 +1,8 @@ public class Spider { - public static int[] getMID (int num_threads) { - int[] mid = new int[num_threads]; - - FileInputStream ifs = new FileInputStream("dstm.conf"); - String str; - String sub; - int fromIndex = 0; - int endIndex = 0; - int[] tmp = new int[4]; - - for (int i = 0; i < num_threads; i++) { - str = ifs.readLine(); - endIndex = str.indexOf('.', fromIndex); - sub = str.subString(fromIndex, endIndex); - - fromIndex = endIndex + 1; - endIndex = str.indexOf('.', fromIndex); - sub = str.subString(fromIndex, endIndex); - - fromIndex = endIndex + 1; - endIndex = str.indexOf('.', fromIndex); - sub = str.subString(fromIndex, endIndex); - - fromIndex = endIndex + 1; - sub = str.subString(fromIndex); - - fromIndex = 0; - } - return mid; - } - public static void main(String[] args) { - int NUM_THREADS = 3; - int depth = 5; - int searchDepth = 5; + int NUM_THREADS = 4; + int maxDepth = 5; + int searchDepth = 10; int i, j; Work[] works; QueryThread[] qt; @@ -43,13 +12,17 @@ public class Spider { GlobalString firstmachine; GlobalString firstpage; - int[] mid = getMID(NUM_THREADS); +// int[] mid = getMID(NUM_THREADS); + int mid[] = new int[NUM_THREADS]; +/* mid[0] = (128<<24)|(195<<16)|(180<<8)|21; //dc-4 + mid[1] = (128<<24)|(195<<16)|(180<<8)|24; //dc-5 + mid[2] = (128<<24)|(195<<16)|(180<<8)|26; //dc-6 + */ + mid[0] = (128<<24)|(195<<16)|(136<<8)|162; //dc-1 + mid[1] = (128<<24)|(195<<16)|(136<<8)|163; //dc-2 + mid[2] = (128<<24)|(195<<16)|(136<<8)|164; //dc-3 + mid[3] = (128<<24)|(195<<16)|(136<<8)|165; //dc-3 -/* int mid[] = new int[NUM_THREADS]; - mid[0] = (128<<24)|(195<<16)|(136<<8)|166; //dc-4 - mid[1] = (128<<24)|(195<<16)|(136<<8)|167; //dc-5 - mid[2] = (128<<24)|(195<<16)|(136<<8)|168; //dc-6 -*/ atomic { firstmachine = global new GlobalString(args[1]); firstpage = global new GlobalString(args[2]); @@ -58,14 +31,14 @@ public class Spider { qt = global new QueryThread[NUM_THREADS]; currentWorkList = global new Query[NUM_THREADS]; - Query firstquery = global new Query(firstmachine, firstpage); + Query firstquery = global new Query(firstmachine, firstpage, 0); Queue todoList = global new Queue(); + Queue doneList = global new Queue(); todoList.push(firstquery); - QueryList doneList = global new QueryList(); for (i = 0; i < NUM_THREADS; i++) { - qt[i] = global new QueryThread(todoList, doneList, depth, searchDepth); + qt[i] = global new QueryThread(todoList, doneList, maxDepth, searchDepth); works[i] = global new Work(qt[i], NUM_THREADS, i, currentWorkList); } } @@ -76,7 +49,7 @@ public class Spider { atomic { tmp = works[i]; } - tmp.start(mid[i]); + Thread.myStart(tmp, mid[i]); } for (i = 0; i < NUM_THREADS; i++) { @@ -85,9 +58,45 @@ public class Spider { } tmp.join(); } + } + + public static int[] getMID (int num_threads) { + int[] mid = new int[num_threads]; + + FileInputStream ifs = new FileInputStream("dstm.conf"); + String str; + String sub; + int fromIndex; + int endIndex; + double num; + + for (int i = 0; i < num_threads; i++) { + int power = 3 - i; + fromIndex = 0; + num = 0; + + str = ifs.readLine(); + + endIndex = str.indexOf('.', fromIndex); + sub = str.subString(fromIndex, endIndex); + num += (Integer.parseInt(sub) << 24); + + fromIndex = endIndex + 1; + endIndex = str.indexOf('.', fromIndex); + sub = str.subString(fromIndex, endIndex); + num += (Integer.parseInt(sub) << 16); + + fromIndex = endIndex + 1; + endIndex = str.indexOf('.', fromIndex); + sub = str.subString(fromIndex, endIndex); + num += (Integer.parseInt(sub) << 8); -// while(true) -// Thread.sleep(1000000); + fromIndex = endIndex + 1; + sub = str.subString(fromIndex); + num += Integer.parseInt(sub); + mid[i] = (int)num; + } + return mid; } } diff --git a/Robust/src/Benchmarks/Spider/dsm/dstm.conf b/Robust/src/Benchmarks/Spider/dsm/dstm.conf index 6b3f3e96..4a1050b0 100644 --- a/Robust/src/Benchmarks/Spider/dsm/dstm.conf +++ b/Robust/src/Benchmarks/Spider/dsm/dstm.conf @@ -1,3 +1,9 @@ -128.195.136.166 -128.195.136.167 -128.195.136.168 +#128.195.180.21 +#128.195.180.24 +#128.195.180.26 +128.195.136.162 +128.195.136.163 +128.195.136.164 +128.195.136.165 +#128.195.136.166 +#128.195.136.167 diff --git a/Robust/src/Benchmarks/Spider/dsm/makefile b/Robust/src/Benchmarks/Spider/dsm/makefile index 0e561faf..35b0e708 100644 --- a/Robust/src/Benchmarks/Spider/dsm/makefile +++ b/Robust/src/Benchmarks/Spider/dsm/makefile @@ -2,7 +2,7 @@ MAINCLASS=Spider SUBCLASS=Query SRC1=${MAINCLASS}.java SRC2=${SUBCLASS}.java -SRC3=${SUBCLASS}List.java +SRC3=${SUBCLASS}Queue.java SRC4=${SUBCLASS}Thread.java FLAGS= -dsm -32bit -nooptimize -debug -recovery -mainclass ${MAINCLASS} default: @@ -11,3 +11,8 @@ default: clean: rm -rf tmpbuilddirectory rm *.bin + rm *.php + rm *.css + rm www* + rm eee* + rm web* -- 2.34.1