added auto-archive script + minor improvements

This commit is contained in:
daniel31x13
2023-12-10 15:26:44 -05:00
parent 8e49ccf723
commit 375a55dd37
11 changed files with 373 additions and 29 deletions
+168
View File
@@ -0,0 +1,168 @@
import { chromium, devices } from "playwright";
import { prisma } from "../../lib/api/db";
import createFile from "../../lib/api/storage/createFile";
import sendToWayback from "../../lib/api/sendToWayback";
import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";
import DOMPurify from "dompurify";
/**
 * Archives a single link according to its owner's archive preferences.
 *
 * Steps, in order:
 *  1. Marks the link's archive fields as "pending" (or null when the
 *     corresponding format is disabled for the user) and stamps `lastPreserved`.
 *  2. Optionally submits the URL to the Wayback Machine.
 *  3. If the user has PDF and/or screenshot archiving enabled, opens the page
 *     in a Chromium instance, stores a Readability extract, then stores the
 *     screenshot and/or PDF.
 *
 * Note: `readabilityPath` is set to "pending" unconditionally in step 1, but
 * the Readability extract is only produced inside step 3.
 *
 * @param linkId - Id of the link record to archive.
 * @param url - Address of the page to load.
 * @param userId - Id of the user whose archive preferences are applied.
 * @throws Re-throws (after logging) any error raised while the browser is in
 *   use; the browser is always closed first.
 */
export default async function urlHandler(
  linkId: number,
  url: string,
  userId: number
) {
  const user = await prisma.user.findUnique({ where: { id: userId } });

  const targetLink = await prisma.link.update({
    where: { id: linkId },
    data: {
      screenshotPath: user?.archiveAsScreenshot ? "pending" : null,
      pdfPath: user?.archiveAsPDF ? "pending" : null,
      readabilityPath: "pending",
      lastPreserved: new Date().toISOString(),
    },
  });

  // Archive.org (intentionally not awaited)
  if (user?.archiveAsWaybackMachine) sendToWayback(url);

  if (user?.archiveAsPDF || user?.archiveAsScreenshot) {
    const browser = await chromium.launch();

    try {
      // Context and page creation live inside the try block so the browser is
      // closed by the finally clause even when either of them fails.
      const context = await browser.newContext(devices["Desktop Chrome"]);
      const page = await context.newPage();

      await page.goto(url, { waitUntil: "domcontentloaded" });

      const content = await page.content();

      // TODO
      // const session = await page.context().newCDPSession(page);
      // const doc = await session.send("Page.captureSnapshot", {
      //   format: "mhtml",
      // });
      // const saveDocLocally = (doc: any) => {
      //   console.log(doc);
      //   return createFile({
      //     data: doc,
      //     filePath: `archives/${targetLink.collectionId}/${linkId}.mhtml`,
      //   });
      // };
      // saveDocLocally(doc.data);

      // Readability
      const window = new JSDOM("").window;
      const purify = DOMPurify(window);
      const cleanedUpContent = purify.sanitize(content);
      const dom = new JSDOM(cleanedUpContent, { url: url });
      const article = new Readability(dom.window.document).parse();

      const articleText = article?.textContent
        .replace(/ +(?= )/g, "") // strip out multiple spaces
        .replace(/(\r\n|\n|\r)/gm, " "); // strip out line breaks

      const readabilityPath = `archives/${targetLink.collectionId}/${linkId}_readability.json`;

      await createFile({
        data: JSON.stringify(article),
        filePath: readabilityPath,
      });

      await prisma.link.update({
        where: { id: linkId },
        data: {
          readabilityPath,
          textContent: articleText,
        },
      });

      // Screenshot/PDF
      // A rejection from the auto-scroll step (e.g. its timeout) marks the
      // capture as faulty instead of aborting the whole handler.
      let faulty = false;
      try {
        await page.evaluate(
          autoScroll,
          Number(process.env.AUTOSCROLL_TIMEOUT) || 30
        );
      } catch {
        faulty = true;
      }

      // Re-read the link: it may have been deleted or moved while archiving.
      const linkExists = await prisma.link.findUnique({
        where: { id: linkId },
      });

      if (!linkExists) {
        // The record is gone, so there is nothing to update. Attempting an
        // update here would throw on the missing record.
      } else if (faulty) {
        await prisma.link.update({
          where: { id: linkId },
          data: {
            screenshotPath: null,
            pdfPath: null,
          },
        });
      } else {
        const screenshotPath = `archives/${linkExists.collectionId}/${linkId}.png`;
        const pdfPath = `archives/${linkExists.collectionId}/${linkId}.pdf`;

        if (user.archiveAsScreenshot) {
          const screenshot = await page.screenshot({ fullPage: true });

          await createFile({
            data: screenshot,
            filePath: screenshotPath,
          });
        }

        if (user.archiveAsPDF) {
          const pdf = await page.pdf({
            width: "1366px",
            height: "1931px",
            printBackground: true,
            margin: { top: "15px", bottom: "15px" },
          });

          await createFile({
            data: pdf,
            filePath: pdfPath,
          });
        }

        await prisma.link.update({
          where: { id: linkId },
          data: {
            screenshotPath: user.archiveAsScreenshot ? screenshotPath : null,
            pdfPath: user.archiveAsPDF ? pdfPath : null,
          },
        });
      }
    } catch (err) {
      console.log(err);
      throw err;
    } finally {
      await browser.close();
    }
  }
}
/**
 * Scrolls the current document to the bottom in 100px steps (one step every
 * 100ms), then jumps back to the top.
 *
 * This function is handed to `page.evaluate`, so it must stay self-contained:
 * it may only rely on its argument and on the `document` / `window` globals.
 *
 * @param AUTOSCROLL_TIMEOUT - Maximum time, in seconds, the scroll may take.
 * @returns A promise that resolves once the bottom has been reached.
 * @throws Error("Webpage was too long to be archived.") when the timeout
 *   elapses before the bottom is reached.
 */
const autoScroll = async (AUTOSCROLL_TIMEOUT: number) => {
  let timeoutId: ReturnType<typeof setTimeout> | undefined;
  let scrollDown: ReturnType<typeof setInterval> | undefined;

  const timeoutPromise = new Promise<void>((_, reject) => {
    timeoutId = setTimeout(() => {
      reject(new Error(`Webpage was too long to be archived.`));
    }, AUTOSCROLL_TIMEOUT * 1000);
  });

  const scrollingPromise = new Promise<void>((resolve) => {
    let totalHeight = 0;
    const distance = 100;

    scrollDown = setInterval(() => {
      // Re-read the height on every tick: it can grow while scrolling.
      const scrollHeight = document.body.scrollHeight;
      window.scrollBy(0, distance);
      totalHeight += distance;

      if (totalHeight >= scrollHeight) {
        clearInterval(scrollDown);
        window.scroll(0, 0);
        resolve();
      }
    }, 100);
  });

  try {
    await Promise.race([scrollingPromise, timeoutPromise]);
  } finally {
    // Whichever side lost the race must not keep running: stop the pending
    // timeout after a successful scroll, and stop scrolling after a timeout.
    clearTimeout(timeoutId);
    clearInterval(scrollDown);
  }
};
+122
View File
@@ -0,0 +1,122 @@
const { S3 } = require("@aws-sdk/client-s3");
const { PrismaClient } = require("@prisma/client");
const { existsSync } = require("fs");
const util = require("util");
// Database client used by the migration below.
const prisma = new PrismaClient();

// Root folder for locally stored files; defaults to "data".
const STORAGE_FOLDER = process.env.STORAGE_FOLDER || "data";

// S3 client, created only when all four SPACES_* variables are set;
// otherwise undefined.
const s3Client = (() => {
  const {
    SPACES_ENDPOINT: endpoint,
    SPACES_REGION: region,
    SPACES_KEY: accessKeyId,
    SPACES_SECRET: secretAccessKey,
  } = process.env;

  if (!(endpoint && region && accessKeyId && secretAccessKey)) {
    return undefined;
  }

  return new S3({
    forcePathStyle: false,
    endpoint,
    region,
    credentials: { accessKeyId, secretAccessKey },
  });
})();
/**
 * Reports whether a stored file exists.
 *
 * When an S3 client is configured the object is looked up in the bucket named
 * by BUCKET_NAME; otherwise the file is looked up under STORAGE_FOLDER on the
 * local filesystem.
 *
 * @param path Storage-relative path (used as the S3 key or appended to
 *   STORAGE_FOLDER).
 * @returns {Promise<boolean>} true when the file exists, false otherwise
 *   (including when the lookup itself fails).
 */
async function checkFileExistence(path) {
  if (s3Client) {
    try {
      // The client method returns a promise when no callback is supplied.
      await s3Client.headObject({
        Bucket: process.env.BUCKET_NAME,
        Key: path,
      });
      return true;
    } catch (err) {
      // A missing object is the expected failure. Anything else (credentials,
      // network, ...) is still reported as "not found" but is logged so it
      // does not go unnoticed.
      const isNotFound =
        err?.name === "NotFound" || err?.$metadata?.httpStatusCode === 404;
      if (!isNotFound) console.log("Error:", err);
      return false;
    }
  }

  try {
    return existsSync(STORAGE_FOLDER + "/" + path);
  } catch (err) {
    console.log(err);
    // Always return a boolean, even on failure.
    return false;
  }
}
/**
 * Back-fills stored file paths on existing records.
 *
 * For every user, sets `image` when an avatar file exists; then, for every
 * link, sets `pdfPath` and `screenshotPath` when the corresponding archive
 * file exists. The record id is logged for every record visited, whether or
 * not a file was found. Disconnects the database client when finished.
 */
async function migrateToV2() {
  // Avatars
  const allUsers = await prisma.user.findMany();

  for (const { id: userId } of allUsers) {
    const avatarPath = `uploads/avatar/${userId}.jpg`;
    const avatarExists = await checkFileExistence(avatarPath);

    if (avatarExists) {
      await prisma.user.update({
        where: { id: userId },
        data: { image: avatarPath },
      });
    }

    console.log(`${userId}`);
  }

  const allLinks = await prisma.link.findMany();

  // Runs one full pass over all links for a single archive format.
  const backfillLinkPaths = async (extension, field) => {
    for (const { id: linkId, collectionId } of allLinks) {
      const archivePath = `archives/${collectionId}/${linkId}.${extension}`;
      const archiveExists = await checkFileExistence(archivePath);

      if (archiveExists) {
        await prisma.link.update({
          where: { id: linkId },
          data: { [field]: archivePath },
        });
      }

      console.log(`${linkId}`);
    }
  };

  // PDFs
  await backfillLinkPaths("pdf", "pdfPath");

  // Screenshots
  await backfillLinkPaths("png", "screenshotPath");

  await prisma.$disconnect();
}
// Run the migration; on failure, print the error and exit with status 1.
migrateToV2().catch((error) => {
  console.error(error);
  process.exit(1);
});
-13
View File
@@ -1,13 +0,0 @@
import shell from "shelljs";
import urlHandler from "../lib/api/urlHandler";
// First CLI argument is the yarn command; the remainder is passed through.
const [command, ...passThrough] = process.argv.slice(2);
const args = passThrough.join(" ");

// Refuse to run without a command.
if (!command) {
  console.log("Please provide a command to run. (start, dev, etc.)");
  process.exit(1);
}

shell.exec(`yarn ${command} ${args}`);
+82
View File
@@ -0,0 +1,82 @@
import { prisma } from "../lib/api/db";
import urlHandler from "./lib/urlHandler";
// CLI arguments joined into one string (not referenced in the code visible here).
const args = process.argv.slice(2).join(" ");

// Number of links fetched per query: ARCHIVE_TAKE_COUNT when it parses to a
// non-zero number, otherwise 1.
const requestedTakeCount = Number(process.env.ARCHIVE_TAKE_COUNT || "");
const archiveTakeCount = requestedTakeCount ? requestedTakeCount : 1;
/**
 * Archives the oldest links that are still missing an archive artifact.
 *
 * Despite its name, this function takes no user: it queries across all
 * collections and fetches at most `archiveTakeCount` links, oldest first.
 *
 * A link is selected when at least one of the following holds:
 *  - its owner archives screenshots and `screenshotPath` is null,
 *  - its owner archives PDFs and `pdfPath` is null,
 *  - `readabilityPath` is null,
 * and its owner has at least one of PDF / screenshot archiving enabled. The
 * latter mirrors the condition under which `urlHandler` opens the page, so
 * owners who enabled only one of the two formats are processed as well.
 *
 * Failures for an individual link are logged and do not stop the batch.
 */
async function processLinksForUser() {
  const links = await prisma.link.findMany({
    where: {
      OR: [
        {
          collection: {
            owner: {
              archiveAsScreenshot: true,
            },
          },
          screenshotPath: null,
        },
        {
          collection: {
            owner: {
              archiveAsPDF: true,
            },
          },
          pdfPath: null,
        },
        {
          readabilityPath: null,
        },
      ],
      collection: {
        owner: {
          // Either format is enough. Requiring both would make the
          // per-format clauses above redundant and skip single-format owners.
          OR: [{ archiveAsPDF: true }, { archiveAsScreenshot: true }],
        },
      },
    },
    take: archiveTakeCount,
    orderBy: { createdAt: "asc" },
    include: {
      collection: true,
    },
  });

  // Process each link using the urlHandler function
  for (const link of links) {
    try {
      console.log(
        `Processing link ${link.id} for user ${link.collection.ownerId}`
      );

      await urlHandler(link.id, link.url || "", link.collection.ownerId);
    } catch (error) {
      console.error(
        `Error processing link ${link.id} for user ${link.collection.ownerId}:`,
        error
      );
    }
  }
}
// Delay, in minutes, between the end of one run and the start of the next.
const intervalInMinutes = 10;

/**
 * Runs one processing cycle and schedules the next.
 *
 * Fetches all users and calls `processLinksForUser` once per user. The user
 * itself is not passed along, so only the number of users affects how many
 * batches run per cycle. Errors are logged, and the next cycle is scheduled
 * regardless of the outcome.
 */
async function processLinksForAllUsers() {
  console.log("Starting the link processing task");

  try {
    const allUsers = await prisma.user.findMany();

    // One batch per user record, run sequentially.
    for (let batch = 0; batch < allUsers.length; batch++) {
      await processLinksForUser();
    }
  } catch (error) {
    console.error("Error processing links for users:", error);
  }

  const delayMs = intervalInMinutes * 60000;
  setTimeout(processLinksForAllUsers, delayMs);
}

// Initial run
processLinksForAllUsers();