From 9f2cd561286784fd000eb8a00f1f80db3185062c Mon Sep 17 00:00:00 2001 From: frosty Date: Fri, 27 Feb 2026 18:32:23 -0500 Subject: added proxying --- example-config.ini | 15 ++- src/Config.c | 18 +++- src/Config.h | 5 + src/Infobox/Dictionary.c | 3 + src/Infobox/Wikipedia.c | 3 + src/Main.c | 34 ++++++- src/Proxy/Proxy.c | 257 +++++++++++++++++++++++++++++++++++++++++++++++ src/Proxy/Proxy.h | 31 ++++++ src/Routes/ImageProxy.c | 3 +- src/Routes/Images.c | 5 +- src/Routes/Search.c | 16 ++- src/Scraping/Scraping.c | 36 ++++++- src/Scraping/Scraping.h | 7 +- 13 files changed, 416 insertions(+), 17 deletions(-) create mode 100644 src/Proxy/Proxy.c create mode 100644 src/Proxy/Proxy.h diff --git a/example-config.ini b/example-config.ini index 8905049..26385ac 100644 --- a/example-config.ini +++ b/example-config.ini @@ -1,3 +1,16 @@ [server] host = 0.0.0.0 -port = 8000 \ No newline at end of file +port = 8000 + +[proxy] +# Single proxy (comment out to use list_file instead) +#proxy = "socks5://127.0.0.1:9050" + +# Or use a proxy list file (one proxy per line) +#list_file = proxies.txt + +#max_retries = 3 + +# Randomize proxy credentials for each request +#randomize_username = true +#randomize_password = true diff --git a/src/Config.c b/src/Config.c index 4a93980..5cea724 100644 --- a/src/Config.c +++ b/src/Config.c @@ -46,11 +46,13 @@ int load_config(const char *filename, Config *config) { } char *value_end = value + strlen(value) - 1; - while (value_end > value && (*value_end == ' ' || *value_end == '\t')) { + while (value_end > value && (*value_end == ' ' || *value_end == '\t' || *value_end == '"' || *value_end == '\'')) { *value_end = '\0'; value_end--; } + while (*value == '"' || *value == '\'') value++; + if (strcmp(section, "server") == 0) { if (strcmp(key, "host") == 0) { strncpy(config->host, value, sizeof(config->host) - 1); @@ -58,6 +60,20 @@ int load_config(const char *filename, Config *config) { } else if (strcmp(key, "port") == 0) { config->port = atoi(value); } + } else if (strcmp(section, "proxy") == 0) { + if (strcmp(key, "proxy") == 0) { + strncpy(config->proxy, value, sizeof(config->proxy) - 1); + config->proxy[sizeof(config->proxy) - 1] = '\0'; + } else if (strcmp(key, "list_file") == 0) { + strncpy(config->proxy_list_file, value, sizeof(config->proxy_list_file) - 1); + config->proxy_list_file[sizeof(config->proxy_list_file) - 1] = '\0'; + } else if (strcmp(key, "max_retries") == 0) { + config->max_proxy_retries = atoi(value); + } else if (strcmp(key, "randomize_username") == 0) { + config->randomize_username = atoi(value); + } else if (strcmp(key, "randomize_password") == 0) { + config->randomize_password = atoi(value); + } } } } diff --git a/src/Config.h b/src/Config.h index 384ed94..d14dd6b 100644 --- a/src/Config.h +++ b/src/Config.h @@ -4,6 +4,11 @@ typedef struct { char host[256]; int port; + char proxy[256]; + char proxy_list_file[256]; + int max_proxy_retries; + int randomize_username; + int randomize_password; } Config; int load_config(const char *filename, Config *config); diff --git a/src/Infobox/Dictionary.c b/src/Infobox/Dictionary.c index a835899..ca4e5cd 100644 --- a/src/Infobox/Dictionary.c +++ b/src/Infobox/Dictionary.c @@ -1,4 +1,6 @@ #include "Dictionary.h" +#include "../Proxy/Proxy.h" +#include "../Scraping/Scraping.h" #include #include #include @@ -216,6 +218,7 @@ InfoBox fetch_dictionary_data(const char *query) { curl_easy_setopt(curl, CURLOPT_WRITEDATA, &chunk); curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0"); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + apply_proxy_settings(curl); if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) { htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL, diff --git a/src/Infobox/Wikipedia.c b/src/Infobox/Wikipedia.c index ed4645f..dff26f6 100644 --- a/src/Infobox/Wikipedia.c +++ b/src/Infobox/Wikipedia.c @@ -1,4 +1,6 @@ #include "Wikipedia.h" +#include "../Proxy/Proxy.h" +#include "../Scraping/Scraping.h" #include #include #include @@ -123,6 +125,7 @@ InfoBox fetch_wiki_data(char *api_url) { WikiWriteMemoryCallback); curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk); curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0"); + apply_proxy_settings(curl_handle); res = curl_easy_perform(curl_handle); diff --git a/src/Main.c b/src/Main.c index b1fd113..4475c5d 100644 --- a/src/Main.c +++ b/src/Main.c @@ -1,10 +1,13 @@ #include #include #include +#include #include #include #include "Config.h" +#include "Proxy/Proxy.h" +#include "Scraping/Scraping.h" #include "Routes/Home.h" #include "Routes/Images.h" #include "Routes/ImageProxy.h" @@ -17,17 +20,45 @@ int handle_opensearch(UrlParams *params) { } int main() { + sigset_t mask; + sigemptyset(&mask); + sigaddset(&mask, SIGPIPE); + pthread_sigmask(SIG_BLOCK, &mask, NULL); + LIBXML_TEST_VERSION xmlInitParser(); curl_global_init(CURL_GLOBAL_DEFAULT); - Config config = {.host = "0.0.0.0", .port = 5000}; + Config config = { + .host = "0.0.0.0", + .port = 5000, + .proxy = "", + .proxy_list_file = "", + .max_proxy_retries = 3, + .randomize_username = 0, + .randomize_password = 0 + }; if (load_config("config.ini", &config) != 0) { fprintf(stderr, "Warning: Could not load config file, using defaults\n"); } + if (config.proxy_list_file[0] != '\0') { + if (load_proxy_list(config.proxy_list_file) < 0) { + fprintf(stderr, "Warning: Failed to load proxy list, continuing without proxies\n"); + } + } + + max_proxy_retries = config.max_proxy_retries; + set_proxy_config(config.proxy, config.randomize_username, config.randomize_password); + + if (proxy_url[0] != '\0') { + fprintf(stderr, "Using proxy: %s\n", proxy_url); + } else if (proxy_count > 0) { + fprintf(stderr, "Using %d proxies from %s\n", proxy_count, config.proxy_list_file); + } + set_handler("/", home_handler); set_handler("/opensearch.xml", handle_opensearch); set_handler("/search", results_handler); @@ -47,5 +78,6 @@ int main() { curl_global_cleanup(); xmlCleanupParser(); + free_proxy_list(); return EXIT_SUCCESS; } \ No newline at end of file diff --git a/src/Proxy/Proxy.c b/src/Proxy/Proxy.c new file mode 100644 index 0000000..939aea0 --- /dev/null +++ b/src/Proxy/Proxy.c @@ -0,0 +1,257 @@ +#include "Proxy.h" +#include +#include +#include +#include +#include + +Proxy *proxy_list = NULL; +int proxy_count = 0; +int max_proxy_retries = 3; +int randomize_username = 0; +int randomize_password = 0; +char proxy_url[512] = {0}; +static pthread_mutex_t proxy_mutex = PTHREAD_MUTEX_INITIALIZER; + +static const char RAND_CHARS[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + +static void generate_random_string(char *buf, size_t len) { + for (size_t i = 0; i < len - 1; i++) { + buf[i] = RAND_CHARS[rand() % (sizeof(RAND_CHARS) - 1)]; + } + buf[len - 1] = '\0'; +} + +void set_proxy_config(const char *proxy_str, int rand_user, int rand_pass) { + if (proxy_str && proxy_str[0]) { + strncpy(proxy_url, proxy_str, sizeof(proxy_url) - 1); + proxy_url[sizeof(proxy_url) - 1] = '\0'; + } + randomize_username = rand_user; + randomize_password = rand_pass; +} + +static Proxy parse_proxy_line(const char *line) { + Proxy proxy = {.type = PROXY_SOCKS5, .port = 0, .username[0] = '\0', .password[0] = '\0', .failures = 0}; + const char *host_start = NULL; + const char *port_start = NULL; + + size_t len = strlen(line); + if (len == 0) return proxy; + + if (strncmp(line, "http://", 7) == 0) { + proxy.type = PROXY_HTTP; + host_start = line + 7; + } else if (strncmp(line, "socks5://", 9) == 0) { + proxy.type = PROXY_SOCKS5; + host_start = line + 9; + } else if (strncmp(line, "socks4://", 9) == 0) { + proxy.type = PROXY_SOCKS4; + host_start = line + 9; + } else { + host_start = line; + } + + const char *at = strchr(host_start, '@'); + if (at) { + char cred_buf[128]; + size_t cred_len = at - host_start; + if (cred_len >= sizeof(cred_buf)) cred_len = sizeof(cred_buf) - 1; + strncpy(cred_buf, host_start, cred_len); + cred_buf[cred_len] = '\0'; + + char *colon = strchr(cred_buf, ':'); + if (colon) { + size_t user_len = colon - cred_buf; + if (user_len >= sizeof(proxy.username)) user_len = sizeof(proxy.username) - 1; + strncpy(proxy.username, cred_buf, user_len); + proxy.username[user_len] = '\0'; + strncpy(proxy.password, colon + 1, sizeof(proxy.password) - 1); + proxy.password[sizeof(proxy.password) - 1] = '\0'; + } + host_start = at + 1; + } + + port_start = strchr(host_start, ':'); + if (port_start) { + char host_buf[256]; + size_t host_len = port_start - host_start; + if (host_len >= sizeof(host_buf)) host_len = sizeof(host_buf) - 1; + strncpy(host_buf, host_start, host_len); + host_buf[host_len] = '\0'; + snprintf(proxy.host, sizeof(proxy.host), "%.*s", (int)host_len, host_buf); + proxy.port = atoi(port_start + 1); + } else { + snprintf(proxy.host, sizeof(proxy.host), "%s", host_start); + } + + return proxy; +} + +int load_proxy_list(const char *filename) { + if (!filename || filename[0] == '\0') { + return 0; + } + + pthread_mutex_lock(&proxy_mutex); + + if (proxy_list) { + free(proxy_list); + proxy_list = NULL; + } + proxy_count = 0; + + FILE *file = fopen(filename, "r"); + if (!file) { + pthread_mutex_unlock(&proxy_mutex); + fprintf(stderr, "[WARN] Could not open proxy list file: %s\n", filename); + return -1; + } + + int capacity = 16; + proxy_list = (Proxy *)malloc(capacity * sizeof(Proxy)); + if (!proxy_list) { + fclose(file); + return -1; + } + proxy_count = 0; + + char line[512]; + while (fgets(line, sizeof(line), file)) { + line[strcspn(line, "\r\n")] = 0; + + if (line[0] == '\0' || line[0] == '#') { + continue; + } + + char *p = line; + while (*p == ' ' || *p == '\t') p++; + + char *end = p + strlen(p) - 1; + while (end > p && (*end == ' ' || *end == '\t')) { + *end = '\0'; + end--; + } + + if (p[0] == '\0') continue; + + Proxy proxy = parse_proxy_line(p); + if (proxy.port == 0) { + continue; + } + + if (proxy_count >= capacity) { + capacity *= 2; + Proxy *new_list = (Proxy *)realloc(proxy_list, capacity * sizeof(Proxy)); + if (!new_list) { + free(proxy_list); + proxy_list = NULL; + proxy_count = 0; + fclose(file); + pthread_mutex_unlock(&proxy_mutex); + return -1; + } + proxy_list = new_list; + } + + proxy_list[proxy_count++] = proxy; + } + + fclose(file); + fprintf(stderr, "[INFO] Loaded %d proxies from %s\n", proxy_count, filename); + pthread_mutex_unlock(&proxy_mutex); + return proxy_count; +} + +void free_proxy_list(void) { + pthread_mutex_lock(&proxy_mutex); + if (proxy_list) { + free(proxy_list); + proxy_list = NULL; + } + proxy_count = 0; + pthread_mutex_unlock(&proxy_mutex); +} + +Proxy *get_random_proxy(void) { + pthread_mutex_lock(&proxy_mutex); + if (proxy_count == 0) { + pthread_mutex_unlock(&proxy_mutex); + return NULL; + } + + int start = rand() % proxy_count; + int checked = 0; + Proxy *selected = NULL; + + while (checked < proxy_count) { + int idx = (start + checked) % proxy_count; + if (proxy_list[idx].failures < max_proxy_retries) { + selected = &proxy_list[idx]; + break; + } + checked++; + } + + if (!selected) { + for (int i = 0; i < proxy_count; i++) { + proxy_list[i].failures = 0; + } + selected = &proxy_list[rand() % proxy_count]; + } + + pthread_mutex_unlock(&proxy_mutex); + return selected; +} + +void record_proxy_failure(Proxy *proxy) { + if (!proxy) return; + pthread_mutex_lock(&proxy_mutex); + proxy->failures++; + pthread_mutex_unlock(&proxy_mutex); +} + +void apply_proxy_settings(CURL *curl) { + if (proxy_url[0] != '\0') { + curl_easy_setopt(curl, CURLOPT_PROXY, proxy_url); + if (strncmp(proxy_url, "socks5://", 9) == 0) { + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5); + } else if (strncmp(proxy_url, "socks4://", 9) == 0) { + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS4A); + } else { + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_HTTP); + } + + if (randomize_username || randomize_password) { + char userpwd[256]; + char username[32] = {0}; + char password[32] = {0}; + + if (randomize_username) generate_random_string(username, sizeof(username)); + if (randomize_password) generate_random_string(password, sizeof(password)); + + snprintf(userpwd, sizeof(userpwd), "%s:%s", username, password); + curl_easy_setopt(curl, CURLOPT_PROXYUSERPWD, userpwd); + } + } else if (proxy_count > 0) { + Proxy *proxy = get_random_proxy(); + if (proxy) { + char proxy_url_buf[512]; + snprintf(proxy_url_buf, sizeof(proxy_url_buf), "%s:%d", proxy->host, proxy->port); + curl_easy_setopt(curl, CURLOPT_PROXY, proxy_url_buf); + if (proxy->type == PROXY_HTTP) { + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_HTTP); + } else if (proxy->type == PROXY_SOCKS4) { + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS4A); + } else { + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5); + } + + if (proxy->username[0] != '\0' || proxy->password[0] != '\0') { + char userpwd[128]; + snprintf(userpwd, sizeof(userpwd), "%s:%s", proxy->username, proxy->password); + curl_easy_setopt(curl, CURLOPT_PROXYUSERPWD, userpwd); + } + } + } +} diff --git a/src/Proxy/Proxy.h b/src/Proxy/Proxy.h new file mode 100644 index 0000000..d9a438d --- /dev/null +++ b/src/Proxy/Proxy.h @@ -0,0 +1,31 @@ +#ifndef PROXY_H +#define PROXY_H + +#include + +typedef enum { PROXY_HTTP, PROXY_SOCKS4, PROXY_SOCKS5 } ProxyType; + +typedef struct { + ProxyType type; + char host[256]; + int port; + char username[64]; + char password[64]; + int failures; +} Proxy; + +extern Proxy *proxy_list; +extern int proxy_count; +extern int max_proxy_retries; +extern int randomize_username; +extern int randomize_password; +extern char proxy_url[512]; + +int load_proxy_list(const char *filename); +void free_proxy_list(void); +Proxy *get_random_proxy(void); +void record_proxy_failure(Proxy *proxy); +void apply_proxy_settings(CURL *curl); +void set_proxy_config(const char *proxy_str, int rand_user, int rand_pass); + +#endif diff --git a/src/Routes/ImageProxy.c b/src/Routes/ImageProxy.c index 9dadef7..5141cd5 100644 --- a/src/Routes/ImageProxy.c +++ b/src/Routes/ImageProxy.c @@ -1,5 +1,5 @@ #include "ImageProxy.h" - +#include "../Proxy/Proxy.h" #include #include #include @@ -118,6 +118,7 @@ int image_proxy_handler(UrlParams *params) { curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L); + apply_proxy_settings(curl); CURLcode res = curl_easy_perform(curl); diff --git a/src/Routes/Images.c b/src/Routes/Images.c index a4770c5..e96d6fd 100644 --- a/src/Routes/Images.c +++ b/src/Routes/Images.c @@ -1,5 +1,7 @@ #include "Images.h" #include "../Utility/Unescape.h" +#include "../Proxy/Proxy.h" +#include "../Scraping/Scraping.h" #include #include @@ -50,6 +52,7 @@ static char *fetch_images_html(const char *url) { "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"); curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 10L); + apply_proxy_settings(curl_handle); CURLcode res = curl_easy_perform(curl_handle); if (res != CURLE_OK) { @@ -247,7 +250,7 @@ int images_handler(UrlParams *params) { } image_matrix[image_count] = malloc(sizeof(char *) * 4); - image_matrix[image_count][0] = proxy_url ? proxy_url : strdup((char *)iurl); + image_matrix[image_count][0] = proxy_url ? strdup(proxy_url) : strdup((char *)iurl); image_matrix[image_count][1] = strdup(title ? (char *)title : "Image"); image_matrix[image_count][2] = strdup(rurl ? (char *)rurl : "#"); image_matrix[image_count][3] = strdup(full_url ? (char *)full_url : "#"); diff --git a/src/Routes/Search.c b/src/Routes/Search.c index 060a222..19419db 100644 --- a/src/Routes/Search.c +++ b/src/Routes/Search.c @@ -88,10 +88,10 @@ static int add_infobox_to_collection(InfoBox *infobox, char ****collection, (int *)realloc(*inner_counts, sizeof(int) * (current_count + 1)); (*collection)[current_count] = (char **)malloc(sizeof(char *) * 4); - (*collection)[current_count][0] = infobox->title; - (*collection)[current_count][1] = infobox->thumbnail_url; - (*collection)[current_count][2] = infobox->extract; - (*collection)[current_count][3] = infobox->url; + (*collection)[current_count][0] = infobox->title ? strdup(infobox->title) : NULL; + (*collection)[current_count][1] = infobox->thumbnail_url ? strdup(infobox->thumbnail_url) : NULL; + (*collection)[current_count][2] = infobox->extract ? strdup(infobox->extract) : NULL; + (*collection)[current_count][3] = infobox->url ? strdup(infobox->url) : NULL; (*inner_counts)[current_count] = 4; return current_count + 1; @@ -151,6 +151,10 @@ int results_handler(UrlParams *params) { jobs[i].max_results = 10; jobs[i].results_count = 0; jobs[i].page = page; + jobs[i].handle = NULL; + jobs[i].response.memory = NULL; + jobs[i].response.size = 0; + jobs[i].response.capacity = 0; } scrape_engines_parallel(jobs, ENGINE_COUNT); @@ -185,6 +189,10 @@ int results_handler(UrlParams *params) { if (infobox_count > 0) { context_set_array_of_arrays(&ctx, "infoboxes", infobox_matrix, infobox_count, infobox_inner_counts); + for (int i = 0; i < infobox_count; i++) { + for (int j = 0; j < 4; j++) free(infobox_matrix[i][j]); + free(infobox_matrix[i]); + } free(infobox_matrix); free(infobox_inner_counts); } diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c index 42e05d6..5b1b5d6 100644 --- a/src/Scraping/Scraping.c +++ b/src/Scraping/Scraping.c @@ -1,4 +1,5 @@ #include "Scraping.h" +#include "../Proxy/Proxy.h" #include "../Utility/Unescape.h" #include #include @@ -329,9 +330,14 @@ static void configure_curl_handle(CURL *curl, const char *full_url, curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L); curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L); curl_easy_setopt(curl, CURLOPT_COOKIEFILE, ""); + + apply_proxy_settings(curl); } int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { + int retries = 0; + +retry: CURLM *multi_handle = curl_multi_init(); if (!multi_handle) { return -1; @@ -339,6 +345,15 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { for (int i = 0; i < num_jobs; i++) { ScrapeJob *job = &jobs[i]; + + if (job->handle) { + curl_easy_cleanup(job->handle); + job->handle = NULL; + } + if (job->response.memory) { + free(job->response.memory); + } + job->handle = curl_easy_init(); if (!job->handle) { continue; @@ -406,7 +421,7 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { CURL *handle = msg->easy_handle; for (int i = 0; i < num_jobs; i++) { - if (jobs[i].handle == handle) { + if (jobs[i].handle && jobs[i].handle == handle) { ScrapeJob *job = &jobs[i]; long response_code; @@ -431,8 +446,10 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { if (headers) curl_slist_free_all(headers); free(job->response.memory); + job->response.memory = NULL; curl_multi_remove_handle(multi_handle, handle); - curl_easy_cleanup(handle); + if (handle) curl_easy_cleanup(handle); + job->handle = NULL; break; } } @@ -440,6 +457,21 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { } curl_multi_cleanup(multi_handle); + + if (retries < max_proxy_retries && proxy_count > 0) { + int any_failed = 0; + for (int i = 0; i < num_jobs; i++) { + if (jobs[i].results_count == 0 && jobs[i].response.size == 0) { + any_failed = 1; + break; + } + } + if (any_failed) { + retries++; + goto retry; + } + } + return 0; } diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h index d8a3b13..0865825 100644 --- a/src/Scraping/Scraping.h +++ b/src/Scraping/Scraping.h @@ -4,11 +4,6 @@ #include #include -#define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__) -#define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__) -#define LOG_DEBUG(msg, ...) fprintf(stderr, "[DEBUG] " msg "\n", ##__VA_ARGS__) -#define LOG_ERROR(msg, ...) fprintf(stderr, "[ERROR] " msg "\n", ##__VA_ARGS__) - typedef struct { char *url; char *title; @@ -55,4 +50,4 @@ int scrape_engine(const SearchEngine *engine, const char *query, int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs); -#endif \ No newline at end of file +#endif -- cgit v1.2.3