aboutsummaryrefslogtreecommitdiff
path: root/src/Scraping/ScrapingHttp.c
diff options
context:
space:
mode:
authorfrosty <gabriel@bwaaa.monster>2026-03-17 13:51:12 -0400
committerfrosty <gabriel@bwaaa.monster>2026-03-17 13:51:12 -0400
commitc7b95d05715a45c7790aa8a7e4b0b61bac2e4208 (patch)
treeb0b511b4cc6610949cdde5a6a220724a31c617fd /src/Scraping/ScrapingHttp.c
parent8c6632502ff992e80051910451421c55894ed9d8 (diff)
downloadomnisearch-c7b95d05715a45c7790aa8a7e4b0b61bac2e4208.tar.gz
fix: refactored scraping components
Diffstat (limited to 'src/Scraping/ScrapingHttp.c')
-rw-r--r--src/Scraping/ScrapingHttp.c109
1 files changed, 109 insertions, 0 deletions
diff --git a/src/Scraping/ScrapingHttp.c b/src/Scraping/ScrapingHttp.c
new file mode 100644
index 0000000..1a6a292
--- /dev/null
+++ b/src/Scraping/ScrapingHttp.c
@@ -0,0 +1,109 @@
+#include "../Proxy/Proxy.h"
+#include "Config.h"
+#include "Scraping.h"
+#include <curl/curl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#define HTTP_DELAY_MIN_US 100000 /* minimum pause used by http_delay(), in microseconds (passed to usleep) */
+#define HTTP_DELAY_RANGE_US 100000 /* http_delay() adds rand() % this value on top of the minimum */
+
+static const char *USER_AGENTS[] = { /* pool of User-Agent strings; get_random_user_agent() returns one of these */
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
+ "like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
+ "Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 "
+ "Firefox/121.0",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
+ "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"};
+
+#define USER_AGENT_COUNT (sizeof(USER_AGENTS) / sizeof(USER_AGENTS[0])) /* number of entries in USER_AGENTS */
+
+/*
+ * Write callback installed via CURLOPT_WRITEFUNCTION: appends the received
+ * chunk to the MemoryBuffer passed through userp and keeps the stored data
+ * NUL-terminated.
+ *
+ * The buffer starts at INITIAL_BUFFER_SIZE and is doubled until the chunk
+ * plus the terminator fits.
+ *
+ * Returns the number of bytes consumed (size * nmemb), or 0 if the buffer
+ * could not be grown; in that case the existing buffer is left untouched.
+ */
+size_t write_memory_callback(void *contents, size_t size, size_t nmemb,
+                             void *userp) {
+  MemoryBuffer *buf = (MemoryBuffer *)userp;
+  const size_t incoming = size * nmemb;
+  const size_t needed = buf->size + incoming + 1; /* +1 for the terminator */
+
+  if (needed > buf->capacity) {
+    size_t grown = buf->capacity != 0 ? buf->capacity * 2 : INITIAL_BUFFER_SIZE;
+    while (grown < needed) {
+      grown *= 2;
+    }
+
+    char *resized = realloc(buf->memory, grown);
+    if (resized == NULL) {
+      return 0;
+    }
+    buf->memory = resized;
+    buf->capacity = grown;
+  }
+
+  memcpy(buf->memory + buf->size, contents, incoming);
+  buf->size += incoming;
+  buf->memory[buf->size] = '\0';
+
+  return incoming;
+}
+
+/*
+ * Picks one entry of USER_AGENTS using rand().
+ *
+ * The returned pointer refers to a string literal in the table and must not
+ * be freed or modified by the caller.
+ */
+const char *get_random_user_agent(void) {
+  const size_t pick = rand() % USER_AGENT_COUNT;
+
+  return USER_AGENTS[pick];
+}
+
+/*
+ * Applies the common request options to a libcurl easy handle.
+ *
+ * curl      - easy handle to configure
+ * full_url  - URL to fetch
+ * chunk     - buffer handed to write_memory_callback() as its userp argument
+ * headers   - header list installed with CURLOPT_HTTPHEADER
+ *
+ * A User-Agent from get_random_user_agent() is set, and apply_proxy_settings()
+ * is called last on the same handle.
+ *
+ * curl_easy_setopt() is variadic, so numeric options must be passed as the
+ * exact type libcurl reads back (long). The explicit casts below keep that
+ * true regardless of how the constants are defined.
+ */
+void configure_curl_handle(CURL *curl, const char *full_url,
+                           MemoryBuffer *chunk, struct curl_slist *headers) {
+  curl_easy_setopt(curl, CURLOPT_URL, full_url);
+  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
+  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_memory_callback);
+  curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
+  curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());
+
+  curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, (long)CURL_HTTP_VERSION_2_0);
+  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
+  curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT,
+                   (long)CURL_DNS_TIMEOUT_SECS);
+  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+  curl_easy_setopt(curl, CURLOPT_TIMEOUT, (long)CURL_TIMEOUT_SECS);
+  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
+  curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
+
+  apply_proxy_settings(curl);
+}
+
+/*
+ * Builds "<base_url><encoded_query>&<page_param>=<page_value>" in a freshly
+ * allocated buffer of BUFFER_SIZE_LARGE bytes.
+ *
+ * page_value is (page - 1) * page_multiplier + page_base, where page is
+ * 1-based; any page below 1 is treated as page 1.
+ *
+ * Returns the URL, which the caller must free(), or NULL if allocation
+ * fails or the URL does not fit in the buffer.
+ */
+char *build_search_url(const char *base_url, const char *page_param,
+                       int page_multiplier, int page_base,
+                       const char *encoded_query, int page) {
+  /* Clamp first, then convert to a zero-based index, so that page <= 0
+   * maps to the first page rather than the second. */
+  int page_index = (page < 1 ? 1 : page) - 1;
+  int page_value = page_index * page_multiplier + page_base;
+
+  char *url = malloc(BUFFER_SIZE_LARGE);
+  if (!url)
+    return NULL;
+
+  int written = snprintf(url, BUFFER_SIZE_LARGE, "%s%s&%s=%d", base_url,
+                         encoded_query, page_param, page_value);
+  /* A truncated URL would silently request the wrong resource. */
+  if (written < 0 || (size_t)written >= (size_t)BUFFER_SIZE_LARGE) {
+    free(url);
+    return NULL;
+  }
+
+  return url;
+}
+
+/*
+ * Builds the header list sent with each request: Host, Referer, Accept,
+ * Accept-Language and DNT, in that order.
+ *
+ * host_header - value placed after "Host: "
+ * referer     - value placed after "Referer: "
+ *
+ * Returns a list the caller must release with curl_slist_free_all(), or
+ * NULL if any header could not be appended (nothing is leaked in that case).
+ */
+struct curl_slist *build_request_headers(const char *host_header,
+                                         const char *referer) {
+  struct curl_slist *headers = NULL;
+  char host_buf[BUFFER_SIZE_MEDIUM], ref_buf[BUFFER_SIZE_MEDIUM];
+
+  snprintf(host_buf, sizeof(host_buf), "Host: %s", host_header);
+  snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", referer);
+
+  const char *const lines[] = {
+      host_buf,
+      ref_buf,
+      "Accept: "
+      "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+      "Accept-Language: en-US,en;q=0.5",
+      "DNT: 1",
+  };
+
+  for (size_t i = 0; i < sizeof(lines) / sizeof(lines[0]); i++) {
+    /* curl_slist_append() returns NULL on failure and leaves the existing
+     * list intact, so keep the old head until the append has succeeded. */
+    struct curl_slist *appended = curl_slist_append(headers, lines[i]);
+    if (!appended) {
+      curl_slist_free_all(headers);
+      return NULL;
+    }
+    headers = appended;
+  }
+
+  return headers;
+}
+
+/*
+ * Sleeps for HTTP_DELAY_MIN_US microseconds plus a rand()-derived jitter in
+ * the range [0, HTTP_DELAY_RANGE_US).
+ */
+void http_delay(void) {
+  const int jitter_us = rand() % HTTP_DELAY_RANGE_US;
+
+  usleep(HTTP_DELAY_MIN_US + jitter_us);
+}