aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorElse <else@localhost>2026-03-18 15:13:18 +0100
committerfrosty <gabriel@bwaaa.monster>2026-03-18 11:23:51 -0400
commit44b6a9b7603e88c7d8f7964effb0b408ce5d1e68 (patch)
tree63b2611543a53fbd731a4878e2d7a9b1bdef8cf9 /src
parentefb9f737fa9f7064601cef71afee7dd74300e908 (diff)
downloadomnisearch-44b6a9b7603e88c7d8f7964effb0b408ce5d1e68.tar.gz
Surface blocked search engine responses
Diffstat (limited to 'src')
-rw-r--r--src/Routes/Search.c114
-rw-r--r--src/Scraping/Scraping.c152
-rw-r--r--src/Scraping/Scraping.h11
3 files changed, 251 insertions, 26 deletions
diff --git a/src/Routes/Search.c b/src/Routes/Search.c
index b9851d7..61465f1 100644
--- a/src/Routes/Search.c
+++ b/src/Routes/Search.c
@@ -155,6 +155,67 @@ static int add_infobox_to_collection(InfoBox *infobox, char ****collection,
return current_count + 1;
}
/* Duplicate s with malloc; returns NULL on allocation failure.
 * Local stand-in for strdup (POSIX, not guaranteed by C11). */
static char *dup_cstring(const char *s) {
  size_t len = strlen(s) + 1;
  char *copy = malloc(len);
  if (copy)
    memcpy(copy, s, len);
  return copy;
}

/* Append a [engine_name, warning_message] pair to the growable matrix used
 * by the template context ("array of arrays" of strings).
 *
 * collection/inner_counts are updated in place; each row holds exactly two
 * heap-owned strings, and inner_counts[row] is set to 2. NULL engine_name or
 * warning_message is stored as "". Returns the new element count, or
 * current_count unchanged on any allocation failure — existing rows are
 * preserved and still owned by the caller either way. */
static int add_warning_to_collection(const char *engine_name,
                                     const char *warning_message,
                                     char ****collection, int **inner_counts,
                                     int current_count) {
  /* realloc keeps existing entries and leaves the old buffers valid on
   * failure, so no manual copy/free dance is needed. */
  char ***grown =
      realloc(*collection, sizeof(char **) * (current_count + 1));
  if (!grown)
    return current_count;
  *collection = grown;

  int *grown_counts =
      realloc(*inner_counts, sizeof(int) * (current_count + 1));
  if (!grown_counts)
    return current_count;
  *inner_counts = grown_counts;

  char **row = malloc(sizeof(char *) * 2);
  if (!row)
    return current_count;

  row[0] = dup_cstring(engine_name ? engine_name : "");
  row[1] = dup_cstring(warning_message ? warning_message : "");
  if (!row[0] || !row[1]) {
    free(row[0]);
    free(row[1]);
    free(row);
    return current_count;
  }

  /* Only publish the row once it is fully built, so the matrix never holds
   * a partially-initialized (or freed) entry. */
  (*collection)[current_count] = row;
  (*inner_counts)[current_count] = 2;
  return current_count + 1;
}
+
+static const char *warning_message_for_job(const ScrapeJob *job) {
+ switch (job->status) {
+ case SCRAPE_STATUS_FETCH_ERROR:
+ return "request failed before OmniSearch could read search results.";
+ case SCRAPE_STATUS_PARSE_MISMATCH:
+ return "returned search results in a format OmniSearch could not parse.";
+ case SCRAPE_STATUS_BLOCKED:
+ return "returned a captcha or another blocking page instead of search "
+ "results.";
+ default:
+ return NULL;
+ }
+}
+
int results_handler(UrlParams *params) {
TemplateContext ctx = new_context();
char *raw_query = "";
@@ -224,6 +285,8 @@ int results_handler(UrlParams *params) {
jobs[i].response.memory = NULL;
jobs[i].response.size = 0;
jobs[i].response.capacity = 0;
+ jobs[i].http_status = 0;
+ jobs[i].status = SCRAPE_STATUS_PENDING;
}
scrape_engines_parallel(jobs, ENGINE_COUNT);
@@ -260,6 +323,44 @@ int results_handler(UrlParams *params) {
free(infobox_inner_counts);
}
+ int warning_count = 0;
+ for (int i = 0; i < ENGINE_COUNT; i++) {
+ if (warning_message_for_job(&jobs[i]))
+ warning_count++;
+ }
+
+ if (warning_count > 0) {
+ char ***warning_matrix = NULL;
+ int *warning_inner_counts = NULL;
+ int warning_index = 0;
+
+ for (int i = 0; i < ENGINE_COUNT; i++) {
+ const char *warning_message = warning_message_for_job(&jobs[i]);
+ if (!warning_message)
+ continue;
+
+ warning_index = add_warning_to_collection(
+ jobs[i].engine->name, warning_message, &warning_matrix,
+ &warning_inner_counts, warning_index);
+ }
+
+ if (warning_index > 0) {
+ context_set_array_of_arrays(&ctx, "engine_warnings", warning_matrix,
+ warning_index, warning_inner_counts);
+ }
+
+ if (warning_matrix) {
+ for (int i = 0; i < warning_index; i++) {
+ free(warning_matrix[i][0]);
+ free(warning_matrix[i][1]);
+ free(warning_matrix[i]);
+ }
+ free(warning_matrix);
+ }
+ if (warning_inner_counts)
+ free(warning_inner_counts);
+ }
+
int total_results = 0;
for (int i = 0; i < ENGINE_COUNT; i++) {
total_results += jobs[i].results_count;
@@ -281,6 +382,15 @@ int results_handler(UrlParams *params) {
send_response(html);
free(html);
}
+ for (int i = 0; i < ENGINE_COUNT; i++)
+ free(all_results[i]);
+ if (page == 1) {
+ for (int i = 0; i < HANDLER_COUNT; i++) {
+ if (infobox_data[i].success) {
+ free_infobox(&infobox_data[i].result);
+ }
+ }
+ }
free_context(&ctx);
return 0;
}
@@ -368,6 +478,10 @@ int results_handler(UrlParams *params) {
send_response(html);
free(html);
}
+
+ for (int i = 0; i < ENGINE_COUNT; i++) {
+ free(all_results[i]);
+ }
}
if (page == 1) {
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
index 0709de4..ff8dec8 100644
--- a/src/Scraping/Scraping.c
+++ b/src/Scraping/Scraping.c
@@ -6,8 +6,98 @@
#include <libxml/HTMLparser.h>
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include <time.h>
/* NULL-safe substring test: true iff both arguments are non-NULL and
 * needle occurs somewhere in response. */
static int response_contains(const char *response, const char *needle) {
  if (!response || !needle)
    return 0;
  return strstr(response, needle) != NULL;
}
+
+static int is_startpage_job(const ScrapeJob *job) {
+ return job && job->engine && strcmp(job->engine->name, "Startpage") == 0;
+}
+
+static int response_is_startpage_captcha(const ScrapeJob *job,
+ const char *response) {
+ if (!is_startpage_job(job))
+ return 0;
+
+ return response_contains(response, "<title>Startpage Captcha</title>") ||
+ response_contains(response, "Startpage Captcha") ||
+ response_contains(response, "/static-pages-assets/page-data/captcha/");
+}
+
+static int response_looks_like_results_page(const ScrapeJob *job,
+ const char *response) {
+ if (!job || !job->engine || !response)
+ return 0;
+
+ if (strcmp(job->engine->name, "DuckDuckGo Lite") == 0) {
+ return response_contains(response, "result-link") ||
+ response_contains(response, "result-snippet");
+ }
+
+ if (strcmp(job->engine->name, "Startpage") == 0) {
+ return response_contains(response, "<title>Startpage Search Results</title>") ||
+ response_contains(response, "class=\"w-gl") ||
+ response_contains(response, "data-testid=\"gl-title-link\"");
+ }
+
+ if (strcmp(job->engine->name, "Yahoo") == 0) {
+ return response_contains(response, "algo-sr") ||
+ response_contains(response, "compTitle") ||
+ response_contains(response, "compText");
+ }
+
+ return 0;
+}
+
/* Classify one engine response, setting job->status and job->results_count.
 *
 * Order matters:
 *   1. empty body            -> FETCH_ERROR (nothing to inspect)
 *   2. Startpage captcha     -> BLOCKED (checked before parsing: the captcha
 *                               page is valid HTML and would otherwise fall
 *                               through to EMPTY/PARSE_MISMATCH)
 *   3. parse + extract       -> OK when the engine parser finds results
 *   4. HTTP >= 400           -> FETCH_ERROR (error page, not a result set)
 *   5. results-page markers  -> PARSE_MISMATCH (page looks real, parser
 *                               found nothing — likely stale selectors)
 *   6. otherwise             -> EMPTY (legitimately zero results)
 */
static void classify_job_response(ScrapeJob *job, const char *response,
                                  size_t response_size) {
  job->results_count = 0;

  if (!response || response_size == 0) {
    job->status = SCRAPE_STATUS_FETCH_ERROR;
    return;
  }

  if (response_is_startpage_captcha(job, response)) {
    job->status = SCRAPE_STATUS_BLOCKED;
    return;
  }

  /* NOTE(review): htmlReadMemory takes an int size; response_size is size_t,
   * so bodies > INT_MAX would be narrowed — confirm responses stay small. */
  xmlDocPtr doc = htmlReadMemory(response, response_size, NULL, NULL,
                                 HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
                                     HTML_PARSE_NOWARNING);

  if (!doc) {
    job->status = SCRAPE_STATUS_FETCH_ERROR;
    return;
  }

  job->results_count =
      job->engine->parser(job->engine->name, doc, job->out_results,
                          job->max_results);
  xmlFreeDoc(doc);

  if (job->results_count > 0) {
    job->status = SCRAPE_STATUS_OK;
    return;
  }

  /* Zero results: decide why. An error status code wins over heuristics. */
  if (job->http_status >= 400) {
    job->status = SCRAPE_STATUS_FETCH_ERROR;
    return;
  }

  if (response_looks_like_results_page(job, response)) {
    job->status = SCRAPE_STATUS_PARSE_MISMATCH;
    return;
  }

  job->status = SCRAPE_STATUS_EMPTY;
}
+
int check_cache_for_job(ScrapeJob *job) {
if (get_cache_ttl_search() <= 0)
return 0;
@@ -22,14 +112,14 @@ int check_cache_for_job(ScrapeJob *job) {
if (cache_get(key, (time_t)get_cache_ttl_search(), &cached_data,
&cached_size) == 0 &&
cached_data && cached_size > 0) {
- xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
- HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
- HTML_PARSE_NOWARNING);
- if (doc) {
- job->results_count = job->engine->parser(
- job->engine->name, doc, job->out_results, job->max_results);
- xmlFreeDoc(doc);
+ classify_job_response(job, cached_data, cached_size);
+
+ if (job->status == SCRAPE_STATUS_BLOCKED) {
+ free(cached_data);
+ free(key);
+ return 0;
}
+
free(cached_data);
free(key);
@@ -46,24 +136,17 @@ int check_cache_for_job(ScrapeJob *job) {
void parse_and_cache_response(ScrapeJob *job) {
if (job->response.size == 0) {
job->results_count = 0;
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
return;
}
- char *key = cache_compute_key(job->query, job->page, job->engine->name);
- if (key && get_cache_ttl_search() > 0)
- cache_set(key, job->response.memory, job->response.size);
- free(key);
-
- xmlDocPtr doc = htmlReadMemory(
- job->response.memory, job->response.size, NULL, NULL,
- HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+ classify_job_response(job, job->response.memory, job->response.size);
- if (doc) {
- job->results_count = job->engine->parser(
- job->engine->name, doc, job->out_results, job->max_results);
- xmlFreeDoc(doc);
- } else {
- job->results_count = 0;
+ if (job->status == SCRAPE_STATUS_OK || job->status == SCRAPE_STATUS_EMPTY) {
+ char *key = cache_compute_key(job->query, job->page, job->engine->name);
+ if (key && get_cache_ttl_search() > 0)
+ cache_set(key, job->response.memory, job->response.size);
+ free(key);
}
}
@@ -78,10 +161,14 @@ void cleanup_job_handle(ScrapeJob *job, CURL *handle) {
}
void process_response(ScrapeJob *job, CURL *handle, CURLMsg *msg) {
+ curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &job->http_status);
+
if (msg->data.result == CURLE_OK)
parse_and_cache_response(job);
- else
+ else {
job->results_count = 0;
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
+ }
cleanup_job_handle(job, handle);
}
@@ -92,14 +179,20 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) {
if (job->response.memory)
free(job->response.memory);
+ job->results_count = 0;
+ job->http_status = 0;
+ job->status = SCRAPE_STATUS_PENDING;
+
if (check_cache_for_job(job)) {
job->results_count = job->results_count > 0 ? job->results_count : 0;
return 0;
}
char *encoded_query = curl_easy_escape(NULL, job->query, 0);
- if (!encoded_query)
+ if (!encoded_query) {
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
return -1;
+ }
char *full_url =
build_search_url(job->engine->base_url, job->engine->page_param,
@@ -107,12 +200,15 @@ int setup_job(ScrapeJob *job, CURLM *multi_handle) {
encoded_query, job->page);
free(encoded_query);
- if (!full_url)
+ if (!full_url) {
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
return -1;
+ }
job->handle = curl_easy_init();
if (!job->handle) {
free(full_url);
+ job->status = SCRAPE_STATUS_FETCH_ERROR;
return -1;
}
@@ -160,7 +256,8 @@ int should_retry(ScrapeJob *jobs, int num_jobs) {
return 0;
for (int i = 0; i < num_jobs; i++) {
- if (jobs[i].results_count == 0 && jobs[i].response.size == 0)
+ if (jobs[i].status == SCRAPE_STATUS_FETCH_ERROR ||
+ jobs[i].status == SCRAPE_STATUS_BLOCKED)
return 1;
}
return 0;
@@ -170,6 +267,7 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
int retries = 0;
retry:
+ ;
CURLM *multi_handle = curl_multi_init();
if (!multi_handle)
return -1;
@@ -213,7 +311,9 @@ int scrape_engine(const SearchEngine *engine, const char *query,
.out_results = out_results,
.max_results = max_results,
.results_count = 0,
- .page = 1};
+ .page = 1,
+ .http_status = 0,
+ .status = SCRAPE_STATUS_PENDING};
scrape_engines_parallel(&job, 1);
return job.results_count;
diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h
index 1439118..014285f 100644
--- a/src/Scraping/Scraping.h
+++ b/src/Scraping/Scraping.h
@@ -32,6 +32,15 @@ typedef struct {
size_t capacity;
} MemoryBuffer;
/* Outcome of a single engine scrape job, set by classify_job_response
 * (and by the fetch/setup error paths). */
typedef enum {
  SCRAPE_STATUS_PENDING,        /* initialized, response not classified yet */
  SCRAPE_STATUS_OK,             /* parser extracted at least one result */
  SCRAPE_STATUS_EMPTY,          /* parsed cleanly but zero results */
  SCRAPE_STATUS_FETCH_ERROR,    /* transfer failed, empty body, unparsable
                                   document, or HTTP status >= 400 */
  SCRAPE_STATUS_PARSE_MISMATCH, /* page has results-page markers but the
                                   parser found nothing (stale selectors?) */
  SCRAPE_STATUS_BLOCKED,        /* captcha / blocking page detected */
} ScrapeStatus;
+
typedef struct {
const SearchEngine *engine;
char *query;
@@ -41,6 +50,8 @@ typedef struct {
CURL *handle;
MemoryBuffer response;
int results_count;
+ long http_status;
+ ScrapeStatus status;
} ScrapeJob;
extern const SearchEngine ENGINE_REGISTRY[];