Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions packages/spacecat-shared-utils/src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,10 @@ export {
} from './formcalc.js';

export { retrievePageAuthentication } from './auth.js';

export {
getScrapedContentPath,
storeScrapedContent,
scrapeUrl,
scrapeAndStoreUrls,
} from './scraper-utils.js';
120 changes: 120 additions & 0 deletions packages/spacecat-shared-utils/src/scraper-utils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import { PutObjectCommand } from '@aws-sdk/client-s3';

Check failure on line 1 in packages/spacecat-shared-utils/src/scraper-utils.js

View workflow job for this annotation

GitHub Actions / Test

missing header
import { hasText } from './functions.js';

Check failure on line 2 in packages/spacecat-shared-utils/src/scraper-utils.js

View workflow job for this annotation

GitHub Actions / Test

'hasText' is defined but never used

Check notice

Code scanning / CodeQL

Unused variable, import, function or class Note

Unused import hasText.

Copilot Autofix

AI 4 months ago

To fix the problem, the unused import hasText should be removed from the file. This will eliminate the unnecessary clutter and improve code readability. The change is straightforward and involves deleting the import statement for hasText.

Suggested changeset 1
packages/spacecat-shared-utils/src/scraper-utils.js

Autofix patch

Autofix patch
Run the following command in your local git repository to apply this patch
cat << 'EOF' | git apply
diff --git a/packages/spacecat-shared-utils/src/scraper-utils.js b/packages/spacecat-shared-utils/src/scraper-utils.js
--- a/packages/spacecat-shared-utils/src/scraper-utils.js
+++ b/packages/spacecat-shared-utils/src/scraper-utils.js
@@ -1,3 +1,2 @@
 import { PutObjectCommand } from '@aws-sdk/client-s3';
-import { hasText } from './functions.js';
 
EOF
@@ -1,3 +1,2 @@
import { PutObjectCommand } from '@aws-sdk/client-s3';
import { hasText } from './functions.js';

Copilot is powered by AI and may make mistakes. Always verify output.

/**
 * Builds the S3 object key under which the scrape of a page is stored.
 *
 * The key has the form `<prefix>/<siteId><path>/scrape.json`, where `<path>` is the URL's
 * pathname with one trailing slash removed. Only the pathname is used; host, query string
 * and fragment do not influence the key.
 *
 * NOTE(review): this layout is meant to match what run-sqs.js expects — confirm the
 * root-URL key (`<prefix>/<siteId>/scrape.json`) against that consumer.
 *
 * @param {string} siteId - Identifier of the site the URL belongs to.
 * @param {string} url - Absolute URL of the scraped page.
 * @param {string} [prefix='scrapes'] - Leading segment of the key.
 * @returns {string} The storage key for the page's scrape.
 * @throws {TypeError} If `url` cannot be parsed as an absolute URL.
 */
export function getScrapedContentPath(siteId, url, prefix = 'scrapes') {
  const { pathname } = new URL(url);
  // A root path ('/') must contribute nothing: substituting '/' here would yield
  // `<prefix>/<siteId>//scrape.json`, i.e. a key containing an empty segment.
  const urlPath = pathname.replace(/\/$/, '');
  return `${prefix}/${siteId}${urlPath}/scrape.json`;
}

/**
 * Serializes scraped content as JSON and uploads it to S3 under the key produced by
 * `getScrapedContentPath`.
 *
 * @param {object} s3Client - S3 client; only its `send(command)` method is used.
 * @param {string} bucketName - Target bucket.
 * @param {string} siteId - Identifier of the site the URL belongs to.
 * @param {string} url - Absolute URL of the scraped page; determines the object key.
 * @param {*} content - JSON-serializable payload to store.
 * @param {object} [options] - Optional settings.
 * @param {string} [options.prefix='scrapes'] - Leading segment of the object key.
 * @param {object} [options.log=console] - Logger; only its `info` method is used.
 * @returns {Promise<string>} The key the content was stored under.
 * @throws Propagates any error raised while building the key or by `s3Client.send`.
 */
export async function storeScrapedContent(
  s3Client,
  bucketName,
  siteId,
  url,
  content,
  options = {},
) {
  // The logger is injectable so library consumers are not forced onto console output.
  const { prefix = 'scrapes', log = console } = options;
  const filePath = getScrapedContentPath(siteId, url, prefix);

  await s3Client.send(new PutObjectCommand({
    Bucket: bucketName,
    Key: filePath,
    Body: JSON.stringify(content),
    ContentType: 'application/json',
  }));
  log.info(`Successfully stored scraped content at: ${filePath}`);

  return filePath;
}

/**
 * Fetches a URL and returns its body together with response metadata.
 *
 * @param {string} url - URL to fetch.
 * @param {object} [options] - Optional settings.
 * @param {object} [options.customHeaders={}] - Extra request headers; these are spread
 *   after the `User-Agent` header, so an identically-keyed entry overrides it.
 * @param {number} [options.timeout=15000] - Milliseconds before the request is aborted.
 * @param {string} [options.userAgent='SpaceCat-Scraper/1.0'] - `User-Agent` header value.
 * @returns {Promise<{finalUrl: string, status: number, headers: object, rawBody: string,
 *   scrapeTime: number, scrapedAt: string}>} `finalUrl` is the response's URL, `scrapeTime`
 *   is epoch milliseconds and `scrapedAt` is the same instant as an ISO-8601 string.
 * @throws {Error} `Failed to scrape <url>: <reason>` on network failure, timeout or a
 *   non-OK HTTP status; the underlying error is available as `cause`.
 */
export async function scrapeUrl(url, options = {}) {
  const {
    customHeaders = {},
    timeout = 15000,
    userAgent = 'SpaceCat-Scraper/1.0',
  } = options;

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': userAgent,
        ...customHeaders,
      },
      signal: AbortSignal.timeout(timeout),
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const rawBody = await response.text();
    // Read the clock once so both timestamp fields describe the same instant.
    const scrapedAt = new Date();

    return {
      finalUrl: response.url,
      status: response.status,
      headers: Object.fromEntries(response.headers.entries()),
      rawBody,
      scrapeTime: scrapedAt.getTime(),
      scrapedAt: scrapedAt.toISOString(),
    };
  } catch (error) {
    // Keep the original error (and its stack) reachable for callers that need to inspect it.
    throw new Error(`Failed to scrape ${url}: ${error.message}`, { cause: error });
  }
}

/**
 * Scrapes each URL in turn and stores the result in S3, collecting a per-URL outcome.
 *
 * A failure while scraping or storing one URL is logged and recorded as a `FAILED` entry;
 * it does not stop the remaining URLs from being processed, and this function does not
 * reject because of it.
 *
 * @param {object} s3Client - S3 client, passed through to `storeScrapedContent`.
 * @param {string} bucketName - Target bucket.
 * @param {string} siteId - Identifier of the site the URLs belong to.
 * @param {Iterable<string>} urls - URLs to scrape.
 * @param {object} [options] - Passed unchanged to both `scrapeUrl` and
 *   `storeScrapedContent`.
 * @param {string} [options.userAgent='SpaceCat-Scraper/1.0'] - User agent recorded in the
 *   stored payload.
 * @param {object} [options.log=console] - Logger; its `info` and `error` methods are used.
 * @returns {Promise<Array<object>>} One entry per URL, in input order: either
 *   `{ url, finalUrl, status: 'COMPLETE', location, scrapeResult }` or
 *   `{ url, status: 'FAILED', error }` where `error` is the failure message.
 */
export async function scrapeAndStoreUrls(s3Client, bucketName, siteId, urls, options = {}) {
  // Resolve the user agent with the same destructuring default scrapeUrl applies, so the
  // stored value cannot disagree with the header that was sent.
  const { userAgent = 'SpaceCat-Scraper/1.0', log = console } = options;
  const results = [];

  // URLs are handled strictly one after another, each awaited before the next starts.
  for (const url of urls) {
    try {
      log.info(`Scraping: ${url}`);
      const scrapeResult = await scrapeUrl(url, options);

      const contentToStore = {
        finalUrl: scrapeResult.finalUrl,
        scrapeResult,
        userAgent,
        scrapeTime: scrapeResult.scrapeTime,
        scrapedAt: scrapeResult.scrapedAt,
      };

      const storagePath = await storeScrapedContent(
        s3Client,
        bucketName,
        siteId,
        url,
        contentToStore,
        options,
      );

      results.push({
        url,
        finalUrl: scrapeResult.finalUrl,
        status: 'COMPLETE',
        location: storagePath,
        scrapeResult,
      });
    } catch (error) {
      log.error(`Failed to scrape ${url}:`, error);
      results.push({
        url,
        status: 'FAILED',
        error: error.message,
      });
    }
  }

  return results;
}