aboutsummaryrefslogtreecommitdiff
path: root/src/Scraping/Scraping.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/Scraping/Scraping.c')
-rw-r--r--src/Scraping/Scraping.c152
1 files changed, 126 insertions, 26 deletions
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
index 0709de4..ff8dec8 100644
--- a/src/Scraping/Scraping.c
+++ b/src/Scraping/Scraping.c
@@ -6,8 +6,98 @@
#include <libxml/HTMLparser.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include <time.h>
+static int response_contains(const char *response, const char *needle) {
+ return response && needle && strstr(response, needle) != NULL;
+}
+
+static int is_startpage_job(const ScrapeJob *job) {
+ return job && job->engine && strcmp(job->engine->name, "Startpage") == 0;
+}
+
+static int response_is_startpage_captcha(const ScrapeJob *job,
+ const char *response) {
+ if (!is_startpage_job(job))
+ return 0;
+
+ return response_contains(response, "<title>Startpage Captcha</title>") ||
+ response_contains(response, "Startpage Captcha") ||
+ response_contains(response, "/static-pages-assets/page-data/captcha/");
+}
+
+static int response_looks_like_results_page(const ScrapeJob *job,
+ const char *response) {
+ if (!job || !job->engine || !response)
+ return 0;
+
+ if (strcmp(job->engine->name, "DuckDuckGo Lite") == 0) {
+ return response_contains(response, "result-link") ||
+ response_contains(response, "result-snippet");
+ }
+
+ if (strcmp(job->engine->name, "Startpage") == 0) {
+ return response_contains(response, "<title>Startpage Search Results</title>") ||
+ response_contains(response, "class=\"w-gl") ||
+ response_contains(response, "data-testid=\"gl-title-link\"");
+ }
+
+ if (strcmp(job->engine->name, "Yahoo") == 0) {
+ return response_contains(response, "algo-sr") ||
+ response_contains(response, "compTitle") ||
+ response_contains(response, "compText");
+ }
+
+ return 0;
+}
+
+static void classify_job_response(ScrapeJob *job, const char *response,
+ size_t response_size) {
+ job->results_count = 0;
+
+ if (!response || response_size == 0) {
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
+ return;
+ }
+
+ if (response_is_startpage_captcha(job, response)) {
+ job->status = SCRAPE_STATUS_BLOCKED;
+ return;
+ }
+
+ xmlDocPtr doc = htmlReadMemory(response, response_size, NULL, NULL,
+ HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
+ HTML_PARSE_NOWARNING);
+
+ if (!doc) {
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
+ return;
+ }
+
+ job->results_count =
+ job->engine->parser(job->engine->name, doc, job->out_results,
+ job->max_results);
+ xmlFreeDoc(doc);
+
+ if (job->results_count > 0) {
+ job->status = SCRAPE_STATUS_OK;
+ return;
+ }
+
+ if (job->http_status >= 400) {
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
+ return;
+ }
+
+ if (response_looks_like_results_page(job, response)) {
+ job->status = SCRAPE_STATUS_PARSE_MISMATCH;
+ return;
+ }
+
+ job->status = SCRAPE_STATUS_EMPTY;
+}
+
int check_cache_for_job(ScrapeJob *job) {
if (get_cache_ttl_search() <= 0)
return 0;
@@ -22,14 +112,14 @@ int check_cache_for_job(ScrapeJob *job) {
if (cache_get(key, (time_t)get_cache_ttl_search(), &cached_data,
&cached_size) == 0 &&
cached_data && cached_size > 0) {
- xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
- HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
- HTML_PARSE_NOWARNING);
- if (doc) {
- job->results_count = job->engine->parser(
- job->engine->name, doc, job->out_results, job->max_results);
- xmlFreeDoc(doc);
+ classify_job_response(job, cached_data, cached_size);
+
+ if (job->status == SCRAPE_STATUS_BLOCKED) {
+ free(cached_data);
+ free(key);
+ return 0;
}
+
free(cached_data);
free(key);
@@ -46,24 +136,17 @@ int check_cache_for_job(ScrapeJob *job) {
void parse_and_cache_response(ScrapeJob *job) {
if (job->response.size == 0) {
job->results_count = 0;
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
return;
}
- char *key = cache_compute_key(job->query, job->page, job->engine->name);
- if (key && get_cache_ttl_search() > 0)
- cache_set(key, job->response.memory, job->response.size);
- free(key);
-
- xmlDocPtr doc = htmlReadMemory(
- job->response.memory, job->response.size, NULL, NULL,
- HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+ classify_job_response(job, job->response.memory, job->response.size);
- if (doc) {
- job->results_count = job->engine->parser(
- job->engine->name, doc, job->out_results, job->max_results);
- xmlFreeDoc(doc);
- } else {
- job->results_count = 0;
+ if (job->status == SCRAPE_STATUS_OK || job->status == SCRAPE_STATUS_EMPTY) {
+ char *key = cache_compute_key(job->query, job->page, job->engine->name);
+ if (key && get_cache_ttl_search() > 0)
+ cache_set(key, job->response.memory, job->response.size);
+ free(key);
}
}
@@ -78,10 +161,14 @@ void cleanup_job_handle(ScrapeJob *job, CURL *handle) {
}
void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
+ curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &job->http_status);
+
if (msg->data.result == CURLE_OK)
parse_and_cache_response(job);
- else
+ else {
job->results_count = 0;
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
+ }
cleanup_job_handle(job, handle);
}
@@ -92,14 +179,20 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) {
if (job->response.memory)
free(job->response.memory);
+ job->results_count = 0;
+ job->http_status = 0;
+ job->status = SCRAPE_STATUS_PENDING;
+
if (check_cache_for_job(job)) {
job->results_count = job->results_count > 0 ? job->results_count : 0;
return 0;
}
char *encoded_query = curl_easy_escape(NULL, job->query, 0);
- if (!encoded_query)
+ if (!encoded_query) {
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
return -1;
+ }
char *full_url =
build_search_url(job->engine->base_url, job->engine->page_param,
@@ -107,12 +200,15 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) {
encoded_query, job->page);
free(encoded_query);
- if (!full_url)
+ if (!full_url) {
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
return -1;
+ }
job->handle = curl_easy_init();
if (!job->handle) {
free(full_url);
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
return -1;
}
@@ -160,7 +256,8 @@ int should_retry(ScrapeJob *jobs, int num_jobs) {
return 0;
for (int i = 0; i < num_jobs; i++) {
- if (jobs[i].results_count == 0 && jobs[i].response.size == 0)
+ if (jobs[i].status == SCRAPE_STATUS_FETCH_ERROR ||
+ jobs[i].status == SCRAPE_STATUS_BLOCKED)
return 1;
}
return 0;
@@ -170,6 +267,7 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
int retries = 0;
retry:
+ ;
CURLM *multi_handle = curl_multi_init();
if (!multi_handle)
return -1;
@@ -213,7 +311,9 @@ int scrape_engine(const SearchEngine *engine, const char *query,
.out_results = out_results,
.max_results = max_results,
.results_count = 0,
- .page = 1};
+ .page = 1,
+ .http_status = 0,
+ .status = SCRAPE_STATUS_PENDING};
scrape_engines_parallel(&job, 1);
return job.results_count;