diff options
Diffstat (limited to 'src/Scraping/Scraping.c')
| -rw-r--r-- | src/Scraping/Scraping.c | 152 |
1 files changed, 126 insertions, 26 deletions
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c index 0709de4..ff8dec8 100644 --- a/src/Scraping/Scraping.c +++ b/src/Scraping/Scraping.c @@ -6,8 +6,98 @@ #include <libxml/HTMLparser.h> #include <stdio.h> #include <stdlib.h> +#include <string.h> #include <time.h> +static int response_contains(const char *response, const char *needle) { + return response && needle && strstr(response, needle) != NULL; +} + +static int is_startpage_job(const ScrapeJob *job) { + return job && job->engine && strcmp(job->engine->name, "Startpage") == 0; +} + +static int response_is_startpage_captcha(const ScrapeJob *job, + const char *response) { + if (!is_startpage_job(job)) + return 0; + + return response_contains(response, "<title>Startpage Captcha</title>") || + response_contains(response, "Startpage Captcha") || + response_contains(response, "/static-pages-assets/page-data/captcha/"); +} + +static int response_looks_like_results_page(const ScrapeJob *job, + const char *response) { + if (!job || !job->engine || !response) + return 0; + + if (strcmp(job->engine->name, "DuckDuckGo Lite") == 0) { + return response_contains(response, "result-link") || + response_contains(response, "result-snippet"); + } + + if (strcmp(job->engine->name, "Startpage") == 0) { + return response_contains(response, "<title>Startpage Search Results</title>") || + response_contains(response, "class=\"w-gl") || + response_contains(response, "data-testid=\"gl-title-link\""); + } + + if (strcmp(job->engine->name, "Yahoo") == 0) { + return response_contains(response, "algo-sr") || + response_contains(response, "compTitle") || + response_contains(response, "compText"); + } + + return 0; +} + +static void classify_job_response(ScrapeJob *job, const char *response, + size_t response_size) { + job->results_count = 0; + + if (!response || response_size == 0) { + job->status = SCRAPE_STATUS_FETCH_ERROR; + return; + } + + if (response_is_startpage_captcha(job, response)) { + job->status = SCRAPE_STATUS_BLOCKED; + return; + } + + xmlDocPtr doc = htmlReadMemory(response, response_size, NULL, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | + HTML_PARSE_NOWARNING); + + if (!doc) { + job->status = SCRAPE_STATUS_FETCH_ERROR; + return; + } + + job->results_count = + job->engine->parser(job->engine->name, doc, job->out_results, + job->max_results); + xmlFreeDoc(doc); + + if (job->results_count > 0) { + job->status = SCRAPE_STATUS_OK; + return; + } + + if (job->http_status >= 400) { + job->status = SCRAPE_STATUS_FETCH_ERROR; + return; + } + + if (response_looks_like_results_page(job, response)) { + job->status = SCRAPE_STATUS_PARSE_MISMATCH; + return; + } + + job->status = SCRAPE_STATUS_EMPTY; +} + int check_cache_for_job(ScrapeJob *job) { if (get_cache_ttl_search() <= 0) return 0; @@ -22,14 +112,14 @@ int check_cache_for_job(ScrapeJob *job) { if (cache_get(key, (time_t)get_cache_ttl_search(), &cached_data, &cached_size) == 0 && cached_data && cached_size > 0) { - xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | - HTML_PARSE_NOWARNING); - if (doc) { - job->results_count = job->engine->parser( - job->engine->name, doc, job->out_results, job->max_results); - xmlFreeDoc(doc); + classify_job_response(job, cached_data, cached_size); + + if (job->status == SCRAPE_STATUS_BLOCKED) { + free(cached_data); + free(key); + return 0; } + free(cached_data); free(key); @@ -46,24 +136,17 @@ int check_cache_for_job(ScrapeJob *job) { void parse_and_cache_response(ScrapeJob *job) { if (job->response.size == 0) { job->results_count = 0; + job->status = SCRAPE_STATUS_FETCH_ERROR; return; } - char *key = cache_compute_key(job->query, job->page, job->engine->name); - if (key && get_cache_ttl_search() > 0) - cache_set(key, job->response.memory, job->response.size); - free(key); - - xmlDocPtr doc = htmlReadMemory( - job->response.memory, job->response.size, NULL, NULL, - HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); + classify_job_response(job, job->response.memory, job->response.size); - if (doc) { - job->results_count = job->engine->parser( - job->engine->name, doc, job->out_results, job->max_results); - xmlFreeDoc(doc); - } else { - job->results_count = 0; + if (job->status == SCRAPE_STATUS_OK || job->status == SCRAPE_STATUS_EMPTY) { + char *key = cache_compute_key(job->query, job->page, job->engine->name); + if (key && get_cache_ttl_search() > 0) + cache_set(key, job->response.memory, job->response.size); + free(key); } } @@ -78,10 +161,14 @@ void cleanup_job_handle(ScrapeJob *job, CURL *handle) { } void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) { + curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &job->http_status); + if (msg->data.result == CURLE_OK) parse_and_cache_response(job); - else + else { job->results_count = 0; + job->status = SCRAPE_STATUS_FETCH_ERROR; + } cleanup_job_handle(job, handle); } @@ -92,14 +179,20 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) { if (job->response.memory) free(job->response.memory); + job->results_count = 0; + job->http_status = 0; + job->status = SCRAPE_STATUS_PENDING; + if (check_cache_for_job(job)) { job->results_count = job->results_count > 0 ? job->results_count : 0; return 0; } char *encoded_query = curl_easy_escape(NULL, job->query, 0); - if (!encoded_query) + if (!encoded_query) { + job->status = SCRAPE_STATUS_FETCH_ERROR; return -1; + } char *full_url = build_search_url(job->engine->base_url, job->engine->page_param, @@ -107,12 +200,15 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) { encoded_query, job->page); free(encoded_query); - if (!full_url) + if (!full_url) { + job->status = SCRAPE_STATUS_FETCH_ERROR; return -1; + } job->handle = curl_easy_init(); if (!job->handle) { free(full_url); + job->status = SCRAPE_STATUS_FETCH_ERROR; return -1; } @@ -160,7 +256,8 @@ int should_retry(ScrapeJob *jobs, int num_jobs) { return 0; for (int i = 0; i < num_jobs; i++) { - if (jobs[i].results_count == 0 && jobs[i].response.size == 0) + if (jobs[i].status == SCRAPE_STATUS_FETCH_ERROR || + jobs[i].status == SCRAPE_STATUS_BLOCKED) return 1; } return 0; @@ -170,6 +267,7 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { int retries = 0; retry: + ; CURLM *multi_handle = curl_multi_init(); if (!multi_handle) return -1; @@ -213,7 +311,9 @@ int scrape_engine(const SearchEngine *engine, const char *query, .out_results = out_results, .max_results = max_results, .results_count = 0, - .page = 1}; + .page = 1, + .http_status = 0, + .status = SCRAPE_STATUS_PENDING}; scrape_engines_parallel(&job, 1); return job.results_count; |
