#include #include #include #include using namespace std; class Config; enum fcurl_type_e { CFTYPE_NONE=0, CFTYPE_FILE=1, CFTYPE_CURL=2 }; struct PageStruct { char* memory; size_t size; }; struct Link { string proto; string fqdn; string path; string file; string query_string; }; struct fcurl_data { enum fcurl_type_e type; union { CURL *curl; FILE *file; } handle; char *buffer; int buffer_len; unsigned int buffer_pos; int still_running; }; typedef struct fcurl_data URL_FILE; enum { ERROR_ARGS = 1 , ERROR_CURL_INIT = 2 }; enum { OPTION_FALSE = 0 , OPTION_TRUE = 1 }; enum { FLAG_DEFAULT = 0 }; class Download { private: CURL* ctx; vector links; string parent; string url; double down_size; double down_speed; long http_code; char* content_type; PageStruct page; public: Download( string url ); Download( string url, string parent ); ~Download(); vector get_links(); Link lnk; int init_context(CURL* context); Link parse_link(string url); string get_url(); void put_links(vector l); void put_down_size(double size); void put_down_speed(double speed); void put_http_code(long code); long get_http_code(); void put_content_type(char* content_type); char* get_page(); CURL* get_context(); string get_base(); }; class Host { public: Host(); ~Host(); bool robots_loaded; vector banned_dirs; time_t last_download_time; static string get_hostname(string url); static string get_dir(string url); }; class Crawler { private: bool get_has_robots(string url); bool get_passes_robots(string url); bool get_is_polite(string url); Host* get_host(string hostname); Config* conf; vector downs; vector delayed_pages; map hosts; public: Crawler(); ~Crawler(); void remove_url(string url); static size_t WritePageCallback(void *ptr, size_t size, size_t nmemb, void *data); char* get_page(int num); void add_url(string u, string parent); static CURLM* mult; void read_robots(string host, string page); int run_crawler( ); string normalize_link(string uri, string base); void extract_links(Download* down, string page, string base); Download* get_down(int down_number); size_t get_downs_size(); vector get_delayed_pages(); static map Crawler::to_map( vector v); }; class Config { private: vector avoid_prefixes; vector avoid_suffixes; map allowed_hosts; vector get_avoid_prefixes(); vector get_avoid_suffixes(); bool is_allowed_host(string hostname); public: Config(); ~Config(); bool is_wanted_uri(string uri); };