From db3fcee0edacd8f11b2a15a95b81f4e6419af2d6 Mon Sep 17 00:00:00 2001
From: anagarwa
Date: Tue, 10 Jun 2025 17:06:25 +0530
Subject: [PATCH 1/2] fix: scraping content in api

---
 packages/spacecat-shared-utils/src/index.js | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/packages/spacecat-shared-utils/src/index.js b/packages/spacecat-shared-utils/src/index.js
index 14ba1ca61..2653a6150 100644
--- a/packages/spacecat-shared-utils/src/index.js
+++ b/packages/spacecat-shared-utils/src/index.js
@@ -75,3 +75,10 @@ export {
 } from './formcalc.js';
 
 export { retrievePageAuthentication } from './auth.js';
+
+export {
+  getScrapedContentPath,
+  storeScrapedContent,
+  scrapeUrl,
+  scrapeAndStoreUrls,
+} from './scraper-utils.js';

From 06ffd72366a72d0252a7881d1540eea2bf1daac3 Mon Sep 17 00:00:00 2001
From: anagarwa
Date: Tue, 10 Jun 2025 17:08:37 +0530
Subject: [PATCH 2/2] fix: scraping content in api

---
 .../src/scraper-utils.js | 118 ++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 packages/spacecat-shared-utils/src/scraper-utils.js

diff --git a/packages/spacecat-shared-utils/src/scraper-utils.js b/packages/spacecat-shared-utils/src/scraper-utils.js
new file mode 100644
index 000000000..8a59c1756
--- /dev/null
+++ b/packages/spacecat-shared-utils/src/scraper-utils.js
@@ -0,0 +1,118 @@
+import { PutObjectCommand } from '@aws-sdk/client-s3';
+
+/**
+ * Generates the storage path for scraped content, matching the layout expected by run-sqs.js.
+ */
+export function getScrapedContentPath(siteId, url, prefix = 'scrapes') {
+  const urlObj = new URL(url);
+  const urlPath = urlObj.pathname.replace(/\/$/, '') || '/';
+  return `${prefix}/${siteId}${urlPath}/scrape.json`;
+}
+
+/**
+ * Stores scraped content in S3 in the format expected by run-sqs.js.
+ */
+export async function storeScrapedContent(s3Client, bucketName, siteId, url, content, options = {}) {
+  const { prefix = 'scrapes' } = options;
+
+  const filePath = getScrapedContentPath(siteId, url, prefix);
+
+  const command = new PutObjectCommand({
+    Bucket: bucketName,
+    Key: filePath,
+    Body: JSON.stringify(content),
+    ContentType: 'application/json',
+  });
+
+  await s3Client.send(command);
+  console.log(`Successfully stored scraped content at: ${filePath}`);
+
+  return filePath;
+}
+
+/**
+ * Fetches a single URL and returns the response body together with response metadata.
+ */
+export async function scrapeUrl(url, options = {}) {
+  const {
+    customHeaders = {},
+    timeout = 15000,
+    userAgent = 'SpaceCat-Scraper/1.0',
+  } = options;
+
+  try {
+    const response = await fetch(url, {
+      headers: {
+        'User-Agent': userAgent,
+        ...customHeaders,
+      },
+      signal: AbortSignal.timeout(timeout),
+    });
+
+    if (!response.ok) {
+      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+    }
+
+    const rawBody = await response.text();
+    const finalUrl = response.url;
+
+    return {
+      finalUrl,
+      status: response.status,
+      headers: Object.fromEntries(response.headers.entries()),
+      rawBody,
+      scrapeTime: Date.now(),
+      scrapedAt: new Date().toISOString(),
+    };
+  } catch (error) {
+    throw new Error(`Failed to scrape ${url}: ${error.message}`);
+  }
+}
+
+/**
+ * Scrapes multiple URLs one at a time and stores each result in S3.
+ */
+export async function scrapeAndStoreUrls(s3Client, bucketName, siteId, urls, options = {}) {
+  const results = [];
+
+  for (const url of urls) {
+    try {
+      console.log(`Scraping: ${url}`);
+      const scrapeResult = await scrapeUrl(url, options);
+
+      const contentToStore = {
+        finalUrl: scrapeResult.finalUrl,
+        scrapeResult,
+        userAgent: options.userAgent || 'SpaceCat-Scraper/1.0',
+        scrapeTime: scrapeResult.scrapeTime,
+        scrapedAt: scrapeResult.scrapedAt,
+      };
+
+      const storagePath = await storeScrapedContent(
+        s3Client,
+        bucketName,
+        siteId,
+        url,
+        contentToStore,
+        options,
+      );
+
+      results.push({
+        url,
+        finalUrl: scrapeResult.finalUrl,
+        status: 'COMPLETE',
+        location: storagePath,
+        scrapeResult,
+      });
+    } catch (error) {
+      console.error(`Failed to scrape ${url}:`, error);
+      results.push({
+        url,
+        status: 'FAILED',
+        error: error.message,
+      });
+    }
+  }
+
+  return results;
+}
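
Usage sketch: a minimal example of how the new helpers could be called end
to end. It assumes the helpers are re-exported from the package root (per
PATCH 1/2) and that the published package name is @adobe/spacecat-shared-utils;
the region, bucket name, site ID, and URLs below are illustrative
placeholders, not values taken from this patch.

    import { S3Client } from '@aws-sdk/client-s3';
    import { getScrapedContentPath, scrapeAndStoreUrls } from '@adobe/spacecat-shared-utils';

    // Placeholder configuration: adjust to the real environment.
    const s3Client = new S3Client({ region: 'us-east-1' });
    const bucketName = 'spacecat-scrape-bucket'; // hypothetical bucket
    const siteId = 'example-site-id'; // hypothetical site ID

    // Key layout produced by getScrapedContentPath:
    //   getScrapedContentPath(siteId, 'https://example.com/about')
    //   -> 'scrapes/example-site-id/about/scrape.json'
    // Note: a site root URL ('https://example.com/') yields a key with a
    // double slash ('scrapes/example-site-id//scrape.json'); the path logic
    // is left as-is since it is stated to match run-sqs.js expectations.
    console.log(getScrapedContentPath(siteId, 'https://example.com/about'));

    // Scrape two pages sequentially and store each result in S3.
    const results = await scrapeAndStoreUrls(
      s3Client,
      bucketName,
      siteId,
      ['https://example.com/', 'https://example.com/about'],
      { timeout: 10000 },
    );

    // Each entry is { url, status: 'COMPLETE', location, ... } on success,
    // or { url, status: 'FAILED', error } on failure.
    console.log(JSON.stringify(results, null, 2));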