added support for monolith

This commit is contained in:
daniel31x13
2024-06-27 12:39:03 -04:00
parent afd5e5f036
commit 9fa9fe5db0
18 changed files with 312 additions and 775 deletions
+22 -171
View File
@@ -7,9 +7,10 @@ import validateUrlSize from "./validateUrlSize";
import createFolder from "./storage/createFolder";
import generatePreview from "./generatePreview";
import { removeFiles } from "./manageLinkFiles";
import archiveAsSinglefile from "./preservationScheme/archiveAsSinglefile";
import archiveAsReadability from "./preservationScheme/archiveAsReadablility";
import shell from "shelljs";
import handleMonolith from "./preservationScheme/handleMonolith";
import handleReadablility from "./preservationScheme/handleReadablility";
import handleArchivePreview from "./preservationScheme/handleArchivePreview";
import handleScreenshotAndPdf from "./preservationScheme/handleScreenshotAndPdf";
type LinksAndCollectionAndOwner = Link & {
collection: Collection & {
@@ -51,26 +52,6 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
const page = await context.newPage();
// await page.goto("https://github.com", {
// waitUntil: "domcontentloaded",
// });
// console.log("Opening page:", link.url);
// await page.evaluate(autoScroll, Number(process.env.AUTOSCROLL_TIMEOUT) || 30);
// const dom = await page.content();
// console.log("The content", dom);
// shell
// .echo(dom)
// .exec(
// "monolith - -I -b https://marketplace.visualstudio.com/items?itemName=42Crunch.vscode-openapi -j -F -o monolith.html"
// );
// console.log("Monolith created!");
createFolder({
filePath: `archives/preview/${link.collectionId}`,
});
@@ -131,15 +112,6 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
},
});
// SingleFile
// if (
// !link.singlefile?.startsWith("archive") &&
// !link.singlefile?.startsWith("unavailable") &&
// user.archiveAsSinglefile &&
// link.url
// )
// await archiveAsSinglefile(link);
// send to archive.org
if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);
@@ -156,19 +128,19 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
const content = await page.content();
// Readability
if (
!link.readable?.startsWith("archives") &&
!link.readable?.startsWith("unavailable")
)
await archiveAsReadability(content, link);
// Preview
if (
!link.preview?.startsWith("archives") &&
!link.preview?.startsWith("unavailable")
)
await getArchivePreview(link, page);
await handleArchivePreview(link, page);
// Readability
if (
!link.readable?.startsWith("archives") &&
!link.readable?.startsWith("unavailable")
)
await handleReadablility(content, link);
// Screenshot/PDF
if (
@@ -177,7 +149,16 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
(!link.pdf?.startsWith("archives") &&
!link.pdf?.startsWith("unavailable"))
)
await captureScreenshotAndPdf(link, page, user);
await handleScreenshotAndPdf(link, page, user);
// SingleFile
if (
!link.singlefile?.startsWith("archive") &&
!link.singlefile?.startsWith("unavailable") &&
user.archiveAsSinglefile &&
link.url
)
await handleMonolith(link, content);
}
})(),
timeoutPromise,
@@ -268,133 +249,3 @@ const pdfHandler = async ({ url, id }: Link) => {
});
}
};
/**
 * Produces the preview image for a link.
 *
 * Reads the page's `og:image` meta tag. When it has a value, the page is
 * navigated to that URL, the response body is passed to `generatePreview`,
 * and the page navigates back. Otherwise a low-quality JPEG screenshot of the
 * current page is stored and its path is written to the link's `preview`.
 *
 * @param link Link whose `id`, `collectionId` and `preview` are read.
 * @param page Page that is expected to already show the link's URL.
 */
const getArchivePreview = async (
  link: LinksAndCollectionAndOwner,
  page: Page
) => {
  // Evaluated lazily at each decision point, exactly where the checks sit.
  const previewAlreadyArchived = () => link.preview?.startsWith("archive");

  const ogImageUrl = await page.evaluate(() => {
    const metaTag = document.querySelector('meta[property="og:image"]');
    if (!metaTag) return null;
    return (metaTag as any).content;
  });

  if (ogImageUrl) {
    console.log("Found og:image URL:", ogImageUrl);

    // Fetch the image by pointing the page at it.
    const imageResponse = await page.goto(ogImageUrl);

    // goto may resolve to null, in which case there is no body to read.
    if (imageResponse && !previewAlreadyArchived()) {
      const buffer = await imageResponse.body();
      generatePreview(buffer, link.collectionId, link.id);
    }

    // Return to the page the caller was working with.
    await page.goBack();
  } else if (!previewAlreadyArchived()) {
    console.log("No og:image found");

    const screenshot = await page.screenshot({ type: "jpeg", quality: 20 });

    await createFile({
      data: screenshot,
      filePath: `archives/preview/${link.collectionId}/${link.id}.jpeg`,
    });

    await prisma.link.update({
      where: { id: link.id },
      data: {
        preview: `archives/preview/${link.collectionId}/${link.id}.jpeg`,
      },
    });
  }
};
/**
 * Captures a full-page PNG screenshot and/or a PDF of the page and records
 * the resulting file paths on the link.
 *
 * The page is auto-scrolled first, then the link is re-fetched so nothing is
 * written for a link that was deleted in the meantime. Screenshot and PDF are
 * captured concurrently. A path is only written to the database when the
 * corresponding capture did not fail, so a failed capture no longer leaves a
 * path pointing at a file that was never created.
 *
 * @param link Link being archived; `id`, `image` and `pdf` are read.
 * @param page Page that already shows the link's URL.
 * @param user Owner whose `archiveAsScreenshot` / `archiveAsPDF` flags decide
 *   which formats are produced.
 */
const captureScreenshotAndPdf = async (
  link: LinksAndCollectionAndOwner,
  page: Page,
  user: User
) => {
  await page.evaluate(autoScroll, Number(process.env.AUTOSCROLL_TIMEOUT) || 30);

  // Check if the user hasn't deleted the link by the time we're done scrolling
  const linkExists = await prisma.link.findUnique({
    where: { id: link.id },
  });

  if (!linkExists) return;

  const imagePath = `archives/${linkExists.collectionId}/${link.id}.png`;
  const pdfPath = `archives/${linkExists.collectionId}/${link.id}.pdf`;

  const saveScreenshot = async () => {
    const screenshot = await page.screenshot({ fullPage: true, type: "png" });
    await createFile({ data: screenshot, filePath: imagePath });
  };

  const savePdf = async () => {
    const margins = {
      top: process.env.PDF_MARGIN_TOP || "15px",
      bottom: process.env.PDF_MARGIN_BOTTOM || "15px",
    };

    const pdf = await page.pdf({
      width: "1366px",
      height: "1931px",
      printBackground: true,
      margin: margins,
    });
    await createFile({ data: pdf, filePath: pdfPath });
  };

  const shouldCaptureImage =
    user.archiveAsScreenshot && !link.image?.startsWith("archive");
  const shouldCapturePdf = user.archiveAsPDF && !link.pdf?.startsWith("archive");

  // Both captures are started before either is awaited so they run
  // concurrently; a skipped capture counts as fulfilled.
  const [imageResult, pdfResult] = await Promise.allSettled([
    shouldCaptureImage ? saveScreenshot() : Promise.resolve(),
    shouldCapturePdf ? savePdf() : Promise.resolve(),
  ]);

  if (imageResult.status === "rejected")
    console.error("Error capturing screenshot:", imageResult.reason);
  if (pdfResult.status === "rejected")
    console.error("Error capturing PDF:", pdfResult.reason);

  await prisma.link.update({
    where: { id: link.id },
    data: {
      // `undefined` leaves the column untouched.
      image:
        user.archiveAsScreenshot && imageResult.status === "fulfilled"
          ? imagePath
          : undefined,
      pdf:
        user.archiveAsPDF && pdfResult.status === "fulfilled"
          ? pdfPath
          : undefined,
    },
  });
};
/**
 * Scrolls the document downwards in 100px steps every 100ms until the
 * accumulated distance reaches `document.body.scrollHeight` or the timeout
 * elapses, whichever comes first, then scrolls back to the top.
 *
 * Whichever condition ends the scroll also cancels the other timer, so no
 * interval keeps scrolling the page after this function has resolved.
 *
 * This function is handed to `page.evaluate` by its caller, so it must stay
 * self-contained: it may only use its argument and page globals
 * (`document`, `window`, timers), never bindings from the enclosing module.
 *
 * @param AUTOSCROLL_TIMEOUT Maximum time to spend scrolling, in seconds.
 */
const autoScroll = async (AUTOSCROLL_TIMEOUT: number) => {
  await new Promise<void>((resolve) => {
    const distance = 100;
    let totalHeight = 0;

    // Single exit path for both "reached the bottom" and "timed out".
    const finish = () => {
      clearInterval(scrollDown);
      clearTimeout(deadline);
      window.scroll(0, 0);
      resolve();
    };

    const scrollDown = setInterval(() => {
      // Re-read on every tick: the height can grow as content lazy-loads.
      const scrollHeight = document.body.scrollHeight;
      window.scrollBy(0, distance);
      totalHeight += distance;

      if (totalHeight >= scrollHeight) finish();
    }, 100);

    const deadline = setTimeout(finish, AUTOSCROLL_TIMEOUT * 1000);
  });
};
@@ -1,111 +0,0 @@
import { execSync } from "child_process";
import createFile from "../storage/createFile";
import axios from "axios";
import { Agent } from "http";
import { prisma } from "../db";
import { Link } from "@prisma/client";
/**
 * Archives a link as a single HTML file, using either an external command or
 * an HTTP API, and stores the resulting path in the link's `singlefile` field.
 *
 * - `SINGLEFILE_ARCHIVE_COMMAND` takes precedence. It must contain the
 *   `{{URL}}` placeholder, which is replaced with the link's URL before the
 *   command is run; its stdout is treated as the HTML.
 * - Otherwise, if `SINGLEFILE_ARCHIVE_HTTP_API` is set, the URL is POSTed to
 *   it and the response body is treated as the HTML.
 * - If neither is set, or the link has no URL, nothing happens.
 *
 * Failures are logged and never thrown to the caller.
 *
 * @param link Link to archive; `id` and `url` are read.
 */
const archiveAsSinglefile = async (link: Link) => {
  if (!link.url) return;

  const command = process.env.SINGLEFILE_ARCHIVE_COMMAND;
  const httpApi = process.env.SINGLEFILE_ARCHIVE_HTTP_API;

  if (command) {
    if (!command.includes("{{URL}}")) {
      console.error("Invalid SINGLEFILE_ARCHIVE_COMMAND. Missing {{URL}}");
      return;
    }

    try {
      // NOTE(review): the URL is substituted into a shell command line as-is;
      // confirm upstream validation makes this safe for untrusted URLs.
      const html = execSync(command.replace("{{URL}}", link.url), {
        timeout: 120000,
        maxBuffer: 1024 * 1024 * 30,
      });

      if (!html.length) {
        console.error(
          "Error running SINGLEFILE_ARCHIVE_COMMAND: Empty buffer"
        );
        return;
      }

      // Re-read the collection id from the database rather than trusting the
      // value on the link object passed in.
      const current = await prisma.link.findUnique({
        where: { id: link.id },
        select: { collectionId: true },
      });
      const collectionId = current?.collectionId;

      if (!collectionId) {
        console.error(
          "Error running SINGLEFILE_ARCHIVE_COMMAND: Collection ID not found"
        );
        return;
      }

      await createFile({
        data: html,
        filePath: `archives/${collectionId}/${link.id}.html`,
      });

      await prisma.link.update({
        where: { id: link.id },
        data: {
          singlefile: `archives/${collectionId}/${link.id}.html`,
        },
      });
    } catch (err) {
      console.error("Error running SINGLEFILE_ARCHIVE_COMMAND:", err);
    }

    return;
  }

  if (!httpApi) return;

  try {
    const html = await axios.post(
      httpApi,
      { url: link.url },
      {
        headers: {
          "Content-Type": "application/x-www-form-urlencoded",
        },
        httpAgent: new Agent({ keepAlive: false }),
      }
    );

    if (!html.data.length) {
      console.error("Error running SINGLEFILE_ARCHIVE_COMMAND: Empty buffer");
      return;
    }

    const current = await prisma.link.findUnique({
      where: { id: link.id },
      select: { collectionId: true },
    });
    const collectionId = current?.collectionId;

    if (!collectionId) {
      console.error(
        "Error running SINGLEFILE_ARCHIVE_COMMAND: Collection ID not found"
      );
      return;
    }

    await createFile({
      data: html.data,
      filePath: `archives/${collectionId}/${link.id}.html`,
    });

    await prisma.link.update({
      where: { id: link.id },
      data: {
        singlefile: `archives/${collectionId}/${link.id}.html`,
      },
    });
  } catch (err) {
    console.error(
      "Error fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API:",
      err
    );
  }
};
export default archiveAsSinglefile;
@@ -0,0 +1,56 @@
import { Collection, Link, User } from "@prisma/client";
import { Page } from "playwright";
import generatePreview from "../generatePreview";
import createFile from "../storage/createFile";
import { prisma } from "../db";
// A Link together with its `collection`, which in turn carries its `owner` User.
type LinksAndCollectionAndOwner = Link & {
collection: Collection & {
owner: User;
};
};
/**
 * Creates the preview image for a link.
 *
 * If the page declares an `og:image` meta tag with a value, the page is
 * navigated to that image, the response body is handed to `generatePreview`,
 * and the page navigates back. If there is no such value, a JPEG screenshot
 * (quality 20) of the current page is saved instead and its path is stored
 * in the link's `preview` field.
 *
 * @param link Link whose `id`, `collectionId` and `preview` are read.
 * @param page Page that is expected to already show the link's URL.
 */
const handleArchivePreview = async (
  link: LinksAndCollectionAndOwner,
  page: Page
) => {
  const ogImageUrl = await page.evaluate(() => {
    const metaTag = document.querySelector('meta[property="og:image"]');
    if (!metaTag) return null;
    return (metaTag as any).content;
  });

  // No usable og:image: fall back to a screenshot of the page itself.
  if (!ogImageUrl) {
    if (link.preview?.startsWith("archive")) return;

    console.log("No og:image found");

    const screenshot = await page.screenshot({ type: "jpeg", quality: 20 });

    await createFile({
      data: screenshot,
      filePath: `archives/preview/${link.collectionId}/${link.id}.jpeg`,
    });

    await prisma.link.update({
      where: { id: link.id },
      data: {
        preview: `archives/preview/${link.collectionId}/${link.id}.jpeg`,
      },
    });

    return;
  }

  console.log("Found og:image URL:", ogImageUrl);

  // Download the image by navigating the page to it.
  const imageResponse = await page.goto(ogImageUrl);

  // goto may resolve to null, in which case there is no body to read.
  if (imageResponse && !link.preview?.startsWith("archive")) {
    const buffer = await imageResponse.body();
    generatePreview(buffer, link.collectionId, link.id);
  }

  // Restore the page the caller was working with.
  await page.goBack();
};
export default handleArchivePreview;
@@ -0,0 +1,97 @@
import { execFileSync, execSync } from "child_process";
import createFile from "../storage/createFile";
import axios from "axios";
import { Agent } from "http";
import { prisma } from "../db";
import { Link } from "@prisma/client";
import { Page } from "playwright";
/**
 * Archives a link as a single HTML file by piping the already-rendered page
 * HTML through the `monolith` CLI, then stores the resulting path in the
 * link's `singlefile` field.
 *
 * `monolith` is started directly with an argument array (no shell), so the
 * link URL can never be interpreted as shell syntax.
 *
 * Environment:
 * - `MONOLITH_OPTIONS`: extra CLI flags, split on whitespace (shell quoting
 *   is not interpreted). Defaults to `-j -F -s`.
 * - `MONOLITH_MAX_BUFFER`: maximum output size in MiB. Defaults to 5 when
 *   unset or not a positive number.
 *
 * Failures are logged and never thrown to the caller.
 *
 * @param link Link to archive; `id`, `url` and `collectionId` are read.
 * @param content HTML of the page, written to monolith's stdin.
 */
const handleMonolith = async (link: Link, content: string) => {
  if (!link.url) return;

  const extraOptions = (process.env.MONOLITH_OPTIONS || "-j -F -s")
    .split(/\s+/)
    .filter(Boolean);

  // Falls back to 5 for unset, non-numeric or zero values so that a typo in
  // the environment cannot produce a NaN buffer size.
  const maxBufferMB = Number(process.env.MONOLITH_MAX_BUFFER) || 5;

  try {
    // The leading "-" target pairs with `input: content` (stdin), and the
    // command's stdout is captured as the archived HTML.
    const html = execFileSync(
      "monolith",
      ["-", "-I", "-b", link.url, ...extraOptions, "-o", "-"],
      {
        timeout: 120000,
        maxBuffer: 1024 * 1024 * maxBufferMB,
        input: content,
      }
    );

    if (!html?.length) {
      console.error("Error running MONOLITH: Empty buffer");
      return;
    }

    const filePath = `archives/${link.collectionId}/${link.id}.html`;

    await createFile({ data: html, filePath });

    await prisma.link.update({
      where: { id: link.id },
      data: { singlefile: filePath },
    });
  } catch (err) {
    console.error("Error running MONOLITH:", err);
  }
};
export default handleMonolith;
@@ -5,7 +5,7 @@ import { prisma } from "../db";
import createFile from "../storage/createFile";
import { Link } from "@prisma/client";
const archiveAsReadablility = async (content: string, link: Link) => {
const handleReadablility = async (content: string, link: Link) => {
const window = new JSDOM("").window;
const purify = DOMPurify(window);
const cleanedUpContent = purify.sanitize(content);
@@ -38,4 +38,4 @@ const archiveAsReadablility = async (content: string, link: Link) => {
}
};
export default archiveAsReadablility;
export default handleReadablility;
@@ -0,0 +1,98 @@
import { Collection, Link, User } from "@prisma/client";
import { Page } from "playwright";
import createFile from "../storage/createFile";
import { prisma } from "../db";
// A Link together with its `collection`, which in turn carries its `owner` User.
type LinksAndCollectionAndOwner = Link & {
collection: Collection & {
owner: User;
};
};
/**
 * Captures a full-page PNG screenshot and/or a PDF of the page and records
 * the resulting file paths on the link.
 *
 * The page is auto-scrolled first, then the link is re-fetched so nothing is
 * written for a link that was deleted in the meantime. Screenshot and PDF are
 * captured concurrently. A path is only written to the database when the
 * corresponding capture did not fail, so a failed capture no longer leaves a
 * path pointing at a file that was never created.
 *
 * @param link Link being archived; `id`, `image` and `pdf` are read.
 * @param page Page that already shows the link's URL.
 * @param user Owner whose `archiveAsScreenshot` / `archiveAsPDF` flags decide
 *   which formats are produced.
 */
const handleScreenshotAndPdf = async (
  link: LinksAndCollectionAndOwner,
  page: Page,
  user: User
) => {
  await page.evaluate(autoScroll, Number(process.env.AUTOSCROLL_TIMEOUT) || 30);

  // Check if the user hasn't deleted the link by the time we're done scrolling
  const linkExists = await prisma.link.findUnique({
    where: { id: link.id },
  });

  if (!linkExists) return;

  const imagePath = `archives/${linkExists.collectionId}/${link.id}.png`;
  const pdfPath = `archives/${linkExists.collectionId}/${link.id}.pdf`;

  const captureScreenshot = async () => {
    const screenshot = await page.screenshot({ fullPage: true, type: "png" });
    await createFile({ data: screenshot, filePath: imagePath });
  };

  const capturePdf = async () => {
    const margins = {
      top: process.env.PDF_MARGIN_TOP || "15px",
      bottom: process.env.PDF_MARGIN_BOTTOM || "15px",
    };

    const pdf = await page.pdf({
      width: "1366px",
      height: "1931px",
      printBackground: true,
      margin: margins,
    });
    await createFile({ data: pdf, filePath: pdfPath });
  };

  const shouldCaptureImage =
    user.archiveAsScreenshot && !link.image?.startsWith("archive");
  const shouldCapturePdf = user.archiveAsPDF && !link.pdf?.startsWith("archive");

  // Both captures are started before either is awaited so they run
  // concurrently; a skipped capture counts as fulfilled.
  const [imageResult, pdfResult] = await Promise.allSettled([
    shouldCaptureImage ? captureScreenshot() : Promise.resolve(),
    shouldCapturePdf ? capturePdf() : Promise.resolve(),
  ]);

  if (imageResult.status === "rejected")
    console.error("Error capturing screenshot:", imageResult.reason);
  if (pdfResult.status === "rejected")
    console.error("Error capturing PDF:", pdfResult.reason);

  await prisma.link.update({
    where: { id: link.id },
    data: {
      // `undefined` leaves the column untouched.
      image:
        user.archiveAsScreenshot && imageResult.status === "fulfilled"
          ? imagePath
          : undefined,
      pdf:
        user.archiveAsPDF && pdfResult.status === "fulfilled"
          ? pdfPath
          : undefined,
    },
  });
};
/**
 * Scrolls the document to the bottom in 100px steps (one step every 100ms),
 * then returns to the top. Scrolling stops early once `AUTOSCROLL_TIMEOUT`
 * seconds have passed.
 *
 * Both exit conditions cancel the interval and the timeout and reset the
 * scroll position, so the page is never left scrolling in the background
 * once this function has resolved.
 *
 * The caller passes this function to `page.evaluate`, so it must remain
 * self-contained: only its argument and page globals (`document`, `window`,
 * timers) may be used, never bindings from the enclosing module.
 *
 * @param AUTOSCROLL_TIMEOUT Maximum time to spend scrolling, in seconds.
 */
const autoScroll = async (AUTOSCROLL_TIMEOUT: number) => {
  await new Promise<void>((resolve) => {
    const step = 100;
    let scrolled = 0;

    // Shared by the "bottom reached" and "timed out" paths.
    const stop = () => {
      clearInterval(ticker);
      clearTimeout(deadline);
      window.scroll(0, 0);
      resolve();
    };

    const ticker = setInterval(() => {
      // Read the height on every tick: it can grow as content lazy-loads.
      const pageHeight = document.body.scrollHeight;
      window.scrollBy(0, step);
      scrolled += step;

      if (scrolled >= pageHeight) stop();
    }, 100);

    const deadline = setTimeout(stop, AUTOSCROLL_TIMEOUT * 1000);
  });
};
export default handleScreenshotAndPdf;
+1 -1
View File
@@ -33,7 +33,7 @@ export default async function validateUrlSize(url: string) {
const totalSizeMB =
Number(response.headers.get("content-length")) / Math.pow(1024, 2);
if (totalSizeMB > (Number(process.env.NEXT_PUBLIC_MAX_FILE_SIZE) || 30))
if (totalSizeMB > Number(process.env.NEXT_PUBLIC_MAX_FILE_BUFFER || 10))
return null;
else return response.headers;
} catch (err) {