// Task subclass representing one unit of crawl work. Judging by the methods visible in this
// file it takes a query (host + path), fetches the page, extracts its title and links, and
// records results in shared maps.
1 public class QueryTask extends Task {
// Shared map supplied by the constructor; processList() looks up and stores Queue values in it,
// keyed by tokens cut from the page title.
4 DistributedHashMap results;
// Shared map supplied by the constructor; done() tests keys with containsKey() and marks
// entries with the value "1".
5 DistributedHashMap visitedList;
// Assigned in execute() from a GlobalStringBuffer that is seeded with the query's host name.
7 GlobalString workingURL;
// Stores the four constructor arguments and starts with a fresh, empty global Queue in toprocess.
// NOTE(review): todoList, maxDepth and toprocess are not declared in the visible lines --
// presumably inherited from Task or declared on an omitted line; confirm.
9 public QueryTask(Queue todoList, DistributedHashMap visitedList, int maxDepth, DistributedHashMap results) {
10 this.todoList = todoList;
11 this.visitedList = visitedList;
12 this.maxDepth = maxDepth;
13 this.results = results;
14 toprocess = global new Queue();
// Processes the current work item (myWork, treated as a GlobalQuery): copies its host name and
// path into local Strings, prints a progress line, and then -- per the visible calls -- opens a
// socket to port 80, sends the request, extracts the page title and collects the page's links.
// Several original lines (local declarations, branch bodies, closing braces) are not visible here.
17 public void execute() {
// Depth of the current query; passed on to the LocalQuery below.
22 depth = ((GlobalQuery)myWork).getDepth();
27 /* global variables */
37 gq = (GlobalQuery)myWork;
// Copy the global strings into ordinary local Strings via toLocalCharArray.
38 hostname = new String(GlobalString.toLocalCharArray(gq.getHostName()));
39 path = new String(GlobalString.toLocalCharArray(gq.getPath()));
// Buffer seeded with the host name; any further appends happen on lines not visible here.
41 GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
44 workingURL = global new GlobalString(gsb.toGlobalString());
// Local counterpart of the query, used by grabTitle() and processPage() below.
47 lq = new LocalQuery(hostname, path, depth);
// Progress output: "[depth] Processing - Hostname : <host>, Path : <path>\n".
49 System.printString("["+lq.getDepth()+"] ");
50 System.printString("Processing - Hostname : ");
51 System.printString(hostname);
52 System.printString(", Path : ");
53 System.printString(path);
54 System.printString("\n");
// Paths with a known document/media extension take this branch; its body is not visible
// (NOTE(review): presumably it returns without fetching -- confirm).
56 if (isDocument(path)) {
60 Socket s = new Socket();
// connect() returning -1 is treated as a failure; the handling itself is not visible.
62 if(s.connect(hostname, 80) == -1) {
// Send the HTTP request over the connected socket.
66 requestQuery(hostname, path, s);
// A non-null title is copied into a global string (gTitle), which done() checks later.
69 if ((title = grabTitle(lq)) != null) {
71 gTitle = global new GlobalString(title);
// Queue returned by processPage() for this page; done() drains it.
74 toprocess = processPage(lq);
81 public static boolean isDocument(String str) {
82 int index = str.lastindexOf('.');
85 if ((str.subString(index+1)).equals("pdf")) return true;
86 else if ((str.subString(index+1)).equals("ps")) return true;
87 else if ((str.subString(index+1)).equals("ppt")) return true;
88 else if ((str.subString(index+1)).equals("pptx")) return true;
89 else if ((str.subString(index+1)).equals("jpg")) return true;
90 else if ((str.subString(index+1)).equals("mp3")) return true;
91 else if ((str.subString(index+1)).equals("wmv")) return true;
92 else if ((str.subString(index+1)).equals("doc")) return true;
93 else if ((str.subString(index+1)).equals("docx")) return true;
94 else if ((str.subString(index+1)).equals("mov")) return true;
95 else if ((str.subString(index+1)).equals("flv")) return true;
// Post-processing step for this task: acts on the grabbed title (if any) and then drains the
// toprocess queue, marking each link that visitedList does not yet contain.
// The obj parameter is not used in the visible lines.
101 public void done(Object obj) {
// Only a non-null, non-empty title is handled; the handling itself is on lines not visible
// here (NOTE(review): presumably a call to processList() -- confirm).
102 if ((gTitle != null) && (gTitle.length() > 0)) {
106 while(!toprocess.isEmpty()) {
107 GlobalQuery q = (GlobalQuery)toprocess.pop();
// Fresh global copies of the popped query's host name and path.
109 GlobalString hostname = global new GlobalString(q.getHostName());
110 GlobalString path = global new GlobalString(q.getPath());
// Key buffer seeded with the host name; any appends to it are on lines not visible here.
112 GlobalStringBuffer gsb = global new GlobalStringBuffer(hostname);
// First time this key is seen: record it in visitedList with the marker value "1".
116 if (!visitedList.containsKey(gsb.toGlobalString())) {
119 GlobalString str = global new GlobalString("1");
120 visitedList.put(gsb.toGlobalString(), str);
// Scans the response text held in lq.response for a "title>" ... "/title>" pair
// (matched case-insensitively), takes the text in between, trims leading/trailing whitespace
// and checks the result with isErrorPage(). Returns null in the visible early-exit cases;
// the remaining return statements are on lines not visible here.
125 public static String grabTitle(LocalQuery lq) {
// NOTE(review): sBrace is not referenced in any visible line.
126 String sBrace = new String("<");
127 String strTitle = new String("title>");
128 String searchstr = lq.response.toString();
// Outer scan: look for every '<' in the response.
137 for (i = 0; i < searchstr.length(); i++) {
138 if (searchstr.charAt(i) == '<') {
// Make sure a full "title>" can still fit before taking the substring.
140 if (searchstr.length() > (i+strTitle.length())) {
// NOTE(review): for this to equal "title>", i must already point past the '<' --
// presumably an omitted line advances i; confirm.
141 tmp = searchstr.subString(i, i+strTitle.length());
142 if (tmp.equalsIgnoreCase("title>")) {
// mindex = first character after the opening tag.
143 mindex = i + tmp.length();
// Inner scan: look for the '<' that starts the closing tag.
144 for (j = mindex; j < searchstr.length(); j++) {
145 if (searchstr.charAt(j) == '<') {
// NOTE(review): no bounds check is visible before this subString; near the end of the
// response j+strTitle.length()+1 may exceed searchstr.length() -- confirm.
147 tmp = searchstr.subString(j, j+strTitle.length()+1);
148 if (tmp.equalsIgnoreCase("/title>")) {
// Raw title text between the two tags (endquote is assigned on a line not visible here).
160 title = searchstr.subString(mindex, endquote);
// Strip leading whitespace.
161 if (Character.isWhitespace(title.charAt(0))){
// NOTE(review): mindex is used here as an index into title; its reset is not visible
// (presumably on an omitted line). The loop has no upper bound check of its own.
163 while (Character.isWhitespace(title.charAt(mindex++)));
165 if (mindex >= title.length()) return null;
166 title = new String(title.subString(mindex));
// Strip trailing whitespace.
169 if (Character.isWhitespace(title.charAt(title.length()-1))) {
170 endquote=title.length()-1;
// NOTE(review): the backwards scan has no visible lower bound.
171 while (Character.isWhitespace(title.charAt(endquote--)));
173 if (mindex >= endquote) return null;
174 title = new String(title.subString(0, endquote));
// Titles that are stock HTTP error/redirect texts get special handling (body not visible).
177 if (isErrorPage(title)) {
185 public static boolean isErrorPage(String str) {
186 if (str.equals("301 Moved Permanently"))
188 else if (str.equals("302 Found"))
190 else if (str.equals("404 Not Found"))
192 else if (str.equals("403 Forbidden"))
194 else if (str.equals("404 File Not Found"))
// Builds an HTTP/1.0 GET request with a Host header and writes it to the given socket.
// hostname - value placed in the Host header
// path     - not used in the visible lines (NOTE(review): presumably appended between "GET "
//            and " HTTP/1.0" on the omitted lines; confirm)
// sock     - socket the request bytes are written to
200 public static void requestQuery(String hostname, String path, Socket sock) {
201 StringBuffer req = new StringBuffer("GET ");
// Request line terminator followed by the Host header name.
204 req.append(" HTTP/1.0\r\nHost: ");
205 req.append(hostname);
// Blank line ends the header section.
206 req.append("\r\n\r\n");
207 sock.write(req.toString().getBytes());
// Reads from the socket in chunks of up to 1024 bytes and appends each chunk's text to
// lq.response, repeating while read() reports a positive count.
// The loop header and the declaration of numchars are on lines not visible here.
210 public static void readResponse(LocalQuery lq, Socket sock) {
215 // state 4 - \r\n\r\n
216 byte[] buffer = new byte[1024];
220 numchars = sock.read(buffer);
// Keep only the part of the buffer that was actually filled.
// NOTE(review): this runs before the loop condition is tested, so a non-positive final
// read reaches subString(0, numchars) unless an omitted line guards it -- confirm.
222 String curr = (new String(buffer)).subString(0, numchars);
224 lq.response.append(curr);
// A fresh buffer is allocated for the next read.
225 buffer = new byte[1024];
226 } while(numchars > 0);
// Splits gTitle on single spaces into tokens, cleans each token with refine(), and uses the
// token as a key into results, where the associated value is a Queue.
// Declarations of mindex/endquote and several branch bodies are on lines not visible here.
229 public void processList() {
231 GlobalString token = null;
// Loop until no further space is found after mindex.
235 while (endquote != -1) {
236 endquote = gTitle.indexOf(' ', mindex);
// A space was found: the token is the text up to it, and scanning resumes just after it.
238 if (endquote != -1) {
239 token = gTitle.subString(mindex, endquote);
240 mindex = endquote + 1;
244 token = refine(token);
// No further space: the token is the remainder of the title.
247 token = gTitle.subString(mindex);
248 token = refine(token);
// Look up the queue stored for this token.
251 Queue q = (Queue)results.get(token);
// NOTE(review): presumably executed only when no queue exists yet (guard not visible).
253 q = global new Queue();
256 results.put(token, q);
260 public boolean filter(GlobalString str) {
261 if (str.equals("of")) return true;
262 else if (str.equals("for")) return true;
263 else if (str.equals("a")) return true;
264 else if (str.equals("an")) return true;
265 else if (str.equals("the")) return true;
266 else if (str.equals("at")) return true;
267 else if (str.equals("and")) return true;
268 else if (str.equals("or")) return true;
269 else if (str.equals("but")) return true;
270 else if (str.equals("to")) return true;
271 else if (str.equals("The")) return true;
272 else if (str.length() == 1) {
273 if (str.charAt(0) == '.') return true;
274 else if (str.charAt(0) == '.') return true;
275 else if (str.charAt(0) == '-') return true;
276 else if (str.charAt(0) == '=') return true;
277 else if (str.charAt(0) == '_') return true;
278 else if (str.charAt(0) == ':') return true;
279 else if (str.charAt(0) == ';') return true;
280 else if (str.charAt(0) == '\'') return true;
281 else if (str.charAt(0) == '\"') return true;
282 else if (str.charAt(0) == '|') return true;
283 else if (str.charAt(0) == '@') return true;
284 else if (str.charAt(0) == '&') return true;
285 else if (str.charAt(0) == ' ') return true;
290 public GlobalString refine(GlobalString str) {
291 str = refinePrefix(str);
292 str = refinePostfix(str);
296 public GlobalString refinePrefix(GlobalString str) {
297 if (str.charAt(0) == '&') { // &
298 return str.subString(1);
300 else if (str.charAt(0) == '/') { // &
301 return str.subString(1);
306 public GlobalString refinePostfix(GlobalString str) {
307 if (str.charAt(str.length()-1) == ',') { // ,
308 return str.subString(0, str.length()-1);
310 else if (str.charAt(str.length()-1) == ':') { // :
311 return str.subString(0, str.length()-1);
313 else if (str.charAt(str.length()-1) == ';') { // ;
314 return str.subString(0, str.length()-1);
316 else if (str.charAt(str.length()-1) == '!') { // !
317 return str.subString(0, str.length()-1);
319 else if (str.charAt(str.length()-1) == 's') { // 's
320 if (str.charAt(str.length()-2) == '\'')
321 return str.subString(0, str.length()-2);
323 else if (str.charAt(str.length()-1) == '-') {
324 int index = str.length()-2;
325 while (Character.isWhitespace(str.charAt(index--)));
326 return str.subString(0, index+2);
331 public static Queue processPage(LocalQuery lq) {
333 String href = new String("href=\"");
334 String searchstr = lq.response.toString();
339 depth = lq.getDepth() + 1;
341 toprocess = global new Queue();
343 int mindex = searchstr.indexOf(href,index);
345 int endquote = searchstr.indexOf('"', mindex+href.length());
346 if (endquote != -1) {
347 String match = searchstr.subString(mindex+href.length(), endquote);
348 String match2 = lq.makewebcanonical(match);
350 GlobalString ghostname;
353 ghostname = global new GlobalString(lq.getHostName(match));
354 gpath = global new GlobalString(lq.getPathName(match));
356 if (match2 != null) {
357 GlobalQuery gq = global new GlobalQuery(ghostname, gpath, depth);