author     frosty <frosty@illegalfirearms.store>    2026-01-22 12:57:27 -0500
committer  frosty <frosty@illegalfirearms.store>    2026-01-22 12:57:27 -0500
commit     47f16db1909d185f7a6c5987226f64f0e2788262 (patch)
tree       4b658904c885959cee41dbcde6bbf89190ae1797 /src/Routes
parent     1a4b78d6938a70503300c4d4b8e1c4c271e1531c (diff)
scraping now more efficient blehhh
Diffstat (limited to 'src/Routes')
-rw-r--r--  src/Routes/Search.c  73
1 file changed, 35 insertions(+), 38 deletions(-)
diff --git a/src/Routes/Search.c b/src/Routes/Search.c
index 110e6f7..fcddfc2 100644
--- a/src/Routes/Search.c
+++ b/src/Routes/Search.c
@@ -12,19 +12,6 @@
#include <time.h>
typedef struct {
- const SearchEngine *engine;
- const char *query;
- SearchResult *results;
- int count;
-} EngineThreadData;
-
-static void *scrape_thread_func(void *arg) {
- EngineThreadData *data = (EngineThreadData *)arg;
- data->count = scrape_engine(data->engine, data->query, &data->results, 10);
- return NULL;
-}
-
-typedef struct {
const char *query;
InfoBox result;
int success;
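
The handler-local EngineThreadData removed above gives way to a shared ScrapeJob type that this diff never defines, so it presumably lives next to scrape_engines_parallel() in the scraper module. A minimal sketch, with field types inferred purely from how the handler fills them below; treat every detail as an assumption:

/* Hypothetical declaration; the field names come from the assignments in
 * results_handler(), everything else is guessed. */
typedef struct {
    const SearchEngine *engine;  /* entry from ENGINE_REGISTRY */
    const char *query;           /* URL-encoded query shared by all jobs */
    SearchResult **out_results;  /* where the scraper stores its array */
    int max_results;             /* per-engine cap (10 in this handler) */
    int results_count;           /* set by the scraper when it finishes */
} ScrapeJob;

void scrape_engines_parallel(ScrapeJob *jobs, int job_count);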
@@ -109,7 +96,6 @@ int results_handler(UrlParams *params) {
}
char *encoded_query = strdup(raw_query);
-
char *display_query = url_decode_query(raw_query);
LOG_INFO("Processing search request for query: '%s'", display_query);
context_set(&ctx, "query", display_query);
@@ -130,18 +116,20 @@ int results_handler(UrlParams *params) {
pthread_create(&wiki_tid, NULL, wiki_thread_func, &wiki_data);
pthread_create(&calc_tid, NULL, calc_thread_func, &calc_data);
- pthread_t engine_tids[ENGINE_COUNT];
- EngineThreadData engine_data[ENGINE_COUNT];
+ ScrapeJob jobs[ENGINE_COUNT];
+ SearchResult *all_results[ENGINE_COUNT];
for (int i = 0; i < ENGINE_COUNT; i++) {
- engine_data[i].engine = &ENGINE_REGISTRY[i];
- engine_data[i].query = encoded_query;
-
- engine_data[i].results = NULL;
- engine_data[i].count = 0;
- pthread_create(&engine_tids[i], NULL, scrape_thread_func, &engine_data[i]);
+ all_results[i] = NULL;
+ jobs[i].engine = &ENGINE_REGISTRY[i];
+ jobs[i].query = encoded_query;
+ jobs[i].out_results = &all_results[i];
+ jobs[i].max_results = 10;
+ jobs[i].results_count = 0;
}
+ scrape_engines_parallel(jobs, ENGINE_COUNT);
+
pthread_join(wiki_tid, NULL);
pthread_join(calc_tid, NULL);
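
With the per-engine pthread boilerplate deleted from this handler, scrape_engines_parallel() presumably owns the thread lifecycle now. A plausible sketch under that assumption, reusing the same spawn-then-join shape as the removed scrape_thread_func code; the worker name and internals are guesses, not the repository's actual implementation:

#include <pthread.h>

/* Sketch only: one worker thread per job, all joined before returning,
 * so the caller can read results_count and *out_results afterwards
 * without any extra synchronization. */
static void *scrape_job_func(void *arg) {
    ScrapeJob *job = (ScrapeJob *)arg;
    job->results_count = scrape_engine(job->engine, job->query,
                                       job->out_results, job->max_results);
    return NULL;
}

void scrape_engines_parallel(ScrapeJob *jobs, int job_count) {
    pthread_t tids[job_count];
    for (int i = 0; i < job_count; i++)
        pthread_create(&tids[i], NULL, scrape_job_func, &jobs[i]);
    for (int i = 0; i < job_count; i++)
        pthread_join(tids[i], NULL);
}

If it does join internally like this, that would also explain why a later hunk can drop the pthread_join(engine_tids[i]) loop and read jobs[i].results_count directly.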
@@ -150,12 +138,14 @@ int results_handler(UrlParams *params) {
int infobox_count = 0;
if (calc_data.success) {
+ LOG_INFO("Calculator result available, adding to InfoBox");
infobox_count =
add_infobox_to_collection(&calc_data.result, &infobox_matrix,
&infobox_inner_counts, infobox_count);
}
if (wiki_data.success) {
+ LOG_INFO("Wikipedia result available, adding to InfoBox");
infobox_count =
add_infobox_to_collection(&wiki_data.result, &infobox_matrix,
&infobox_inner_counts, infobox_count);
@@ -172,8 +162,9 @@ int results_handler(UrlParams *params) {
int total_results = 0;
for (int i = 0; i < ENGINE_COUNT; i++) {
- pthread_join(engine_tids[i], NULL);
- total_results += engine_data[i].count;
+ total_results += jobs[i].results_count;
+ LOG_INFO("Engine %s returned %d results",
+ jobs[i].engine->name, jobs[i].results_count);
}
if (total_results > 0) {
@@ -183,8 +174,8 @@ int results_handler(UrlParams *params) {
int unique_count = 0;
for (int i = 0; i < ENGINE_COUNT; i++) {
- for (int j = 0; j < engine_data[i].count; j++) {
- char *raw_url = engine_data[i].results[j].url;
+ for (int j = 0; j < jobs[i].results_count; j++) {
+ char *raw_url = all_results[i][j].url;
char *clean_url = unescape_search_url(raw_url);
char *display_url = clean_url ? clean_url : raw_url;
@@ -198,9 +189,9 @@ int results_handler(UrlParams *params) {
if (is_duplicate) {
if (clean_url) free(clean_url);
- free(engine_data[i].results[j].url);
- free(engine_data[i].results[j].title);
- free(engine_data[i].results[j].snippet);
+ free(all_results[i][j].url);
+ free(all_results[i][j].title);
+ free(all_results[i][j].snippet);
continue;
}
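
The url/title/snippet trio is freed here for duplicates and again further down once a result has been copied into results_matrix. A hypothetical helper (free_search_result is not in this diff) would keep that ownership rule in one place:

/* Hypothetical helper: releases the strings a SearchResult owns; the
 * structs themselves live in the all_results[i] array freed after the loop. */
static void free_search_result(SearchResult *r) {
    free(r->url);
    free(r->title);
    free(r->snippet);
}

The duplicate branch would then shrink to free_search_result(&all_results[i][j]); continue;.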
@@ -211,27 +202,32 @@ int results_handler(UrlParams *params) {
results_matrix[unique_count][0] = strdup(display_url);
results_matrix[unique_count][1] = strdup(pretty_url);
results_matrix[unique_count][2] =
- engine_data[i].results[j].title
- ? strdup(engine_data[i].results[j].title)
+ all_results[i][j].title
+ ? strdup(all_results[i][j].title)
: strdup("Untitled");
results_matrix[unique_count][3] =
- engine_data[i].results[j].snippet
- ? strdup(engine_data[i].results[j].snippet)
+ all_results[i][j].snippet
+ ? strdup(all_results[i][j].snippet)
: strdup("");
results_inner_counts[unique_count] = 4;
free(pretty_url);
- free(engine_data[i].results[j].url);
- free(engine_data[i].results[j].title);
- free(engine_data[i].results[j].snippet);
+ free(all_results[i][j].url);
+ free(all_results[i][j].title);
+ free(all_results[i][j].snippet);
if (clean_url) free(clean_url);
unique_count++;
}
- free(engine_data[i].results);
+
+ if (all_results[i]) {
+ free(all_results[i]);
+ }
}
+ LOG_INFO("Deduplicated to %d unique results", unique_count);
+
context_set_array_of_arrays(&ctx, "results", results_matrix, unique_count,
results_inner_counts);
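
For reference, every row passed to context_set_array_of_arrays() holds the same four strings, which is why each results_inner_counts entry is set to 4. Column meanings, inferred from the assignments above:

/* results_matrix[row] layout as populated by this handler:
 *   [0] display_url - unescaped URL (falls back to the raw scraped URL)
 *   [1] pretty_url  - shortened, human-readable URL for rendering
 *   [2] title       - scraped title, or "Untitled" when missing
 *   [3] snippet     - scraped snippet, or "" when missing
 */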
@@ -250,6 +246,7 @@ int results_handler(UrlParams *params) {
free(results_matrix);
free(results_inner_counts);
} else {
+ LOG_WARN("No search results found for query: '%s'", display_query);
char *html = render_template("results.html", &ctx);
if (html) {
send_response(html);
@@ -270,4 +267,4 @@ int results_handler(UrlParams *params) {
free_context(&ctx);
return 0;
-}
+}
\ No newline at end of file