/* * The Alecto crawler and storage engine is released under GPL 2.0. * A license should be distributed with this code. This statement * must be released with any use of this code. * * Copyright David Kellogg, 2008 */ #include #include #include #include #include #include #include #include #include #include #include "crawler.h" #include "base.h" #include "re.h" using namespace std; static void *myrealloc(void *ptr, size_t size) { if(ptr) return realloc(ptr, size); else return malloc(size); } Download::Download(string u) { parent = "__ROOT__"; url = u; lnk = parse_link(url); page.memory = NULL; page.size = 0; } Download::Download(string u, string p) { parent = p; url = u; //printf("Loading url: %s\n",url.c_str()); lnk = parse_link(url); page.memory = NULL; page.size = 0; } Download::~Download() { if(page.memory) { free(page.memory); } } string Download::get_base() { string base = lnk.fqdn + lnk.path+ "/"; return base; } string Download::get_url() { return url; } Link Download::parse_link(string url) { Link ln; ln.proto = "http"; ln.query_string = ""; string pattern = "^([^/]+)(/.*)?(/[^/]*)"; regex res = pcre_match(pattern, url.c_str()); if(res.matches.size() > 0) { ln.fqdn = res.matches[1].sub; ln.path = res.matches[2].sub; ln.file = res.matches[3].sub; } return ln; } vector Download::get_links() { return links; } void Download::put_down_size(double size) { down_size = size; } void Download::put_down_speed(double speed) { down_speed = speed; } void Download::put_http_code(long code) { http_code = code; } long Download::get_http_code() { return http_code; } char* Download::get_page() { return page.memory; } void Download::put_content_type(char* type) { content_type = type; } void Download::put_links(vector l) { links = l; } CURL* Download::get_context() { return ctx; } int Download::init_context(CURL* context) { ctx = context; if( NULL == ctx ){ printf("Unable to initialize cURL interface"); return( ERROR_CURL_INIT ) ; } curl_easy_setopt( ctx, CURLOPT_URL, get_url().c_str() ) ; curl_easy_setopt( ctx, CURLOPT_NOPROGRESS , OPTION_TRUE ) ; curl_easy_setopt( ctx, CURLOPT_USERAGENT , "Alecto http://davidkellogg.com/" ) ; curl_easy_setopt( ctx, CURLOPT_WRITEFUNCTION, Crawler::WritePageCallback); curl_easy_setopt( ctx, CURLOPT_WRITEDATA, (void *)&page); return 0; } CURLM* Crawler::mult; Crawler::Crawler() { curl_global_init( CURL_GLOBAL_ALL ) ; conf = new Config(); } Crawler::~Crawler() { for( unsigned int i=0; i < downs.size(); i++) { CURL* next_context = downs[i]->get_context(); curl_multi_remove_handle(mult, next_context); curl_easy_cleanup(next_context); } curl_global_cleanup() ; } size_t Crawler::WritePageCallback(void *ptr, size_t size, size_t nmemb, void *data) { size_t realsize = size * nmemb; struct PageStruct *mem = (struct PageStruct *)data; //printf("received size %d\n",(int)realsize); mem->memory = (char *)myrealloc(mem->memory, mem->size + realsize + 1); if (mem->memory) { //printf("mem is %s\n", mem->memory); memcpy(&(mem->memory[mem->size]), ptr, realsize); mem->size += realsize; mem->memory[mem->size] = 0; } return realsize; } void Crawler::add_url(string u, string parent) { //printf("adding url %s\n", u.c_str()); Download* d = new Download(u, parent); downs.push_back(d); } void Crawler::remove_url(string url) { vector::iterator it; it = downs.begin(); for(it = downs.begin(); it != downs.end(); ) { if(! strcmp((*it)->get_url().c_str(), url.c_str())) { // if(! (*it)->get_url().compare(url)) { delete(*it); it = downs.erase(it); continue; } it++; } } char* Crawler::get_page(int num) { if( downs[num]->get_page()) { return downs[num]->get_page(); } else { return ""; } } int Crawler::run_crawler( ){ // DO NOT create an FD for a host within 10 seconds of the last // DO NOT create an FD for a host without a robots file // DO NOT create an FD for an excluded directory //printf("Runninng crawler\n"); vector::iterator mydowns; for(mydowns=downs.begin();mydowns != downs.end(); ) { Download* d = *mydowns; // EXCLUSIONS bool has_robots = get_has_robots(d->get_url()); if(!has_robots && ! strstr(d->get_url().c_str(), "robots.txt")) { //if(!has_robots && d->get_url().find("robots.txt") != d->get_url().end()) { //printf("You must download robots for %s\n", d->get_url().c_str()); mydowns = downs.erase(mydowns); continue; } bool passes_robots = get_passes_robots(d->get_url()); if(!passes_robots) { //printf("Robots excluded for directory %s\n", d->get_url().c_str()); mydowns = downs.erase(mydowns); continue; } bool is_polite = get_is_polite(d->get_url()); if(!is_polite) { //printf("Politeness violated for %s\n", d->get_url().c_str()); mydowns = downs.erase(mydowns); delayed_pages.push_back(d->get_url()); continue; } // END EXCLUSIONS CURL* new_context = curl_easy_init(); int res = d->init_context(new_context); if(res != 0) { // FAILURE return res; } ++mydowns; } if(downs.size() == 0) { return 0; } int still_running; CURL* mult = curl_multi_init(); for( unsigned int i=0; i < downs.size(); i++) { //printf("adding handle\n"); curl_multi_add_handle( mult, downs[i]->get_context() ); } int running_handles; curl_multi_perform( mult, &running_handles ); while(CURLM_CALL_MULTI_PERFORM == curl_multi_perform(mult, &still_running)) {;} while(still_running) { struct timeval timeout; int rc; fd_set fdread; fd_set fdwrite; fd_set fdexcep; int maxfd; FD_ZERO(&fdread); FD_ZERO(&fdwrite); FD_ZERO(&fdexcep); timeout.tv_sec = 1; timeout.tv_usec = 0; curl_multi_fdset(mult, &fdread, &fdwrite, &fdexcep, &maxfd); rc = select(maxfd+1, &fdread, &fdwrite, &fdexcep, &timeout); switch(rc) { case -1: /* select error */ break; case 0: printf("timeout!\n"); default: while(CURLM_CALL_MULTI_PERFORM == curl_multi_perform(mult, &still_running)); break; } } double statDouble ; long statLong ; char* statString = NULL ; for(unsigned int i=0;iget_context(); if( CURLE_OK == curl_easy_getinfo( this_context, CURLINFO_HTTP_CODE , &statLong ) ){ downs[i]->put_http_code(statLong); } if( CURLE_OK == curl_easy_getinfo( this_context, CURLINFO_CONTENT_TYPE , &statString ) ){ downs[i]->put_content_type(statString); } /* */ if( CURLE_OK == curl_easy_getinfo( this_context, CURLINFO_SIZE_DOWNLOAD , &statDouble ) ){ downs[i]->put_down_size(statDouble); } if( CURLE_OK == curl_easy_getinfo( this_context, CURLINFO_SPEED_DOWNLOAD , &statDouble ) ){ downs[i]->put_down_speed(statDouble); } } return( 0 ) ; } void Crawler::extract_links(Download* down, string page, string base) { string pattern = "a href=\"([^\"#]*)"; vector tokens; tokenize(page, tokens, "<"); vector new_links; for(unsigned int i=0; i 0) { string link = res.matches[1].sub; link = normalize_link(link, base); if(conf->is_wanted_uri(link)) { //printf("pushing %s\n",link.c_str()); new_links.push_back(link); } } } down->put_links(new_links); } string Crawler::normalize_link(string uri, string base) { //printf("normalizing: %s base: %s\n", uri.c_str(), base.c_str()); // It might already be normalized //regex is_full_url = pcre_match("^http://(.*)", uri.c_str()); // Starts with . regex res_dot = pcre_match("^\\./(.*)", uri.c_str()); // Starts with http:// regex res_http = pcre_match("^[^:/]+://(.+)", uri.c_str()); if(res_dot.matches.size() > 0) { uri = base + res_dot.matches[1].sub; //printf("found dot\n"); } else if(res_http.matches.size() > 0) { // printf("found http: %s\n", res_http); uri = res_http.matches[1].sub; // noop } else { //printf("found relative\n"); uri = base + uri; } //printf("before .. removal %s\n", uri.c_str()); uri = pcre_replace("/[^/\\.]+/\\.\\./", uri, "/"); //printf(" after .. removal %s\n", uri.c_str()); return uri; } bool Crawler::get_has_robots(string url) { Host* host = get_host(url); if(host == NULL) { return false; } return host->robots_loaded; } bool Crawler::get_passes_robots(string url) { Host* h = get_host(url); string dir = Host::get_dir(url); if(h != NULL && h->banned_dirs.size() > 0) { vector::iterator it; for(it=h->banned_dirs.begin(); it !=h->banned_dirs.end();it++) { string banned_dir = *it; if(dir.find(banned_dir) != string::npos) { return false; } } } return true; } /* * * Crawler::get_is_polite * Args: * url to check hostname politeness * Returns: * true if we proceed with download * false if politeness is violated * */ bool Crawler::get_is_polite(string url) { // cannot enter same host 2 times in 10 seconds Host* h = get_host(url); int polite_period = 10; time_t now = time(NULL); if(h != NULL) { if(h->last_download_time > now - polite_period) { return false; } h->last_download_time = now; } return true; } /* * Crawler::read_robots * * Creates Host with robots filled in if necessary * Arguments: * host fqdn of page * page robots.txt file to remember * */ void Crawler::read_robots(string host, string page) { vector tokens; tokenize(page, tokens, "\n"); string hostname = host; Host* h = new Host(); h->robots_loaded = true; for(unsigned int i=0; i 0) { h->banned_dirs.push_back(res.matches[1].sub); } } hosts[hostname] = h; } Host* Crawler::get_host(string url) { string hostname = Host::get_hostname(url); Host* host = NULL; if(hosts[hostname]) { host = hosts[hostname]; } return host; } Download* Crawler::get_down(int down_number) { return downs[down_number]; } size_t Crawler::get_downs_size() { return downs.size(); } vector Crawler::get_delayed_pages() { return delayed_pages; } map Crawler::to_map( vector v) { map m; vector::iterator it; for(it = v.begin();it != v.end(); it++) { m[*it] = true; } return m; } Host::Host() { last_download_time = 0; } Host::~Host() { } string Host::get_hostname(string url) { //printf("get hostname url: %s\n", url.c_str()); regex res = pcre_match("^([^\\/]*)", url); if(res.matches.size() > 0) { //printf("get hostname match: %s\n", res.matches[1].sub.c_str() ); return res.matches[1].sub; } else { return ""; } } string Host::get_dir(string url) { regex res = pcre_match("^[^\\/]*(.*)(\\/.*)?", url); //regex res; pcre_match("^[^\\/]*(.*)(\\/.*)?", url, res); if(res.matches.size() > 0) { return res.matches[1].sub; } else { return ""; } } Config::Config() { avoid_prefixes.push_back("ftp://"); avoid_prefixes.push_back("news://"); avoid_prefixes.push_back("rsync://"); avoid_suffixes.push_back("gz"); avoid_suffixes.push_back("asc"); avoid_suffixes.push_back("md5"); allowed_hosts["localhost"] = true; } Config::~Config() {} vector Config::get_avoid_prefixes() { return avoid_prefixes; } vector Config::get_avoid_suffixes() { return avoid_suffixes; } bool Config::is_allowed_host(string hostname) { if(allowed_hosts[hostname] == true) { return true; } return false; } bool Config::is_wanted_uri(string uri) { //return true; bool is_wanted = true; string hostname = Host::get_hostname(uri.c_str()); if(! is_allowed_host(hostname) ) { //printf("unallowed host: %s\n", hostname.c_str()); is_wanted = false; } for(unsigned int i=0;i 0) { //printf("unallowed suffixes: %s\n",uri.c_str()); is_wanted = false; } } for(unsigned int i=0;i 0) { //printf("unallowed prefixes: %s\n",uri.c_str()); is_wanted = false; } } return is_wanted; }