added support for monolith
This commit is contained in:
+22
-171
@@ -7,9 +7,10 @@ import validateUrlSize from "./validateUrlSize";
|
||||
import createFolder from "./storage/createFolder";
|
||||
import generatePreview from "./generatePreview";
|
||||
import { removeFiles } from "./manageLinkFiles";
|
||||
import archiveAsSinglefile from "./preservationScheme/archiveAsSinglefile";
|
||||
import archiveAsReadability from "./preservationScheme/archiveAsReadablility";
|
||||
import shell from "shelljs";
|
||||
import handleMonolith from "./preservationScheme/handleMonolith";
|
||||
import handleReadablility from "./preservationScheme/handleReadablility";
|
||||
import handleArchivePreview from "./preservationScheme/handleArchivePreview";
|
||||
import handleScreenshotAndPdf from "./preservationScheme/handleScreenshotAndPdf";
|
||||
|
||||
type LinksAndCollectionAndOwner = Link & {
|
||||
collection: Collection & {
|
||||
@@ -51,26 +52,6 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
|
||||
|
||||
const page = await context.newPage();
|
||||
|
||||
// await page.goto("https://github.com", {
|
||||
// waitUntil: "domcontentloaded",
|
||||
// });
|
||||
|
||||
// console.log("Opening page:", link.url);
|
||||
|
||||
// await page.evaluate(autoScroll, Number(process.env.AUTOSCROLL_TIMEOUT) || 30);
|
||||
|
||||
// const dom = await page.content();
|
||||
|
||||
// console.log("The content", dom);
|
||||
|
||||
// shell
|
||||
// .echo(dom)
|
||||
// .exec(
|
||||
// "monolith - -I -b https://marketplace.visualstudio.com/items?itemName=42Crunch.vscode-openapi -j -F -o monolith.html"
|
||||
// );
|
||||
|
||||
// console.log("Monolith created!");
|
||||
|
||||
createFolder({
|
||||
filePath: `archives/preview/${link.collectionId}`,
|
||||
});
|
||||
@@ -131,15 +112,6 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
|
||||
},
|
||||
});
|
||||
|
||||
// SingleFile
|
||||
// if (
|
||||
// !link.singlefile?.startsWith("archive") &&
|
||||
// !link.singlefile?.startsWith("unavailable") &&
|
||||
// user.archiveAsSinglefile &&
|
||||
// link.url
|
||||
// )
|
||||
// await archiveAsSinglefile(link);
|
||||
|
||||
// send to archive.org
|
||||
if (user.archiveAsWaybackMachine && link.url) sendToWayback(link.url);
|
||||
|
||||
@@ -156,19 +128,19 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
|
||||
|
||||
const content = await page.content();
|
||||
|
||||
// Readability
|
||||
if (
|
||||
!link.readable?.startsWith("archives") &&
|
||||
!link.readable?.startsWith("unavailable")
|
||||
)
|
||||
await archiveAsReadability(content, link);
|
||||
|
||||
// Preview
|
||||
if (
|
||||
!link.preview?.startsWith("archives") &&
|
||||
!link.preview?.startsWith("unavailable")
|
||||
)
|
||||
await getArchivePreview(link, page);
|
||||
await handleArchivePreview(link, page);
|
||||
|
||||
// Readability
|
||||
if (
|
||||
!link.readable?.startsWith("archives") &&
|
||||
!link.readable?.startsWith("unavailable")
|
||||
)
|
||||
await handleReadablility(content, link);
|
||||
|
||||
// Screenshot/PDF
|
||||
if (
|
||||
@@ -177,7 +149,16 @@ export default async function archiveHandler(link: LinksAndCollectionAndOwner) {
|
||||
(!link.pdf?.startsWith("archives") &&
|
||||
!link.pdf?.startsWith("unavailable"))
|
||||
)
|
||||
await captureScreenshotAndPdf(link, page, user);
|
||||
await handleScreenshotAndPdf(link, page, user);
|
||||
|
||||
// SingleFile
|
||||
if (
|
||||
!link.singlefile?.startsWith("archive") &&
|
||||
!link.singlefile?.startsWith("unavailable") &&
|
||||
user.archiveAsSinglefile &&
|
||||
link.url
|
||||
)
|
||||
await handleMonolith(link, content);
|
||||
}
|
||||
})(),
|
||||
timeoutPromise,
|
||||
@@ -268,133 +249,3 @@ const pdfHandler = async ({ url, id }: Link) => {
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Generates the preview image for a link from the page that is currently
 * loaded in `page` (presumably the link's URL, navigated by the caller).
 *
 * When the page declares an `og:image` meta tag, the page is navigated to that
 * image, the response body is handed to `generatePreview`, and the page is
 * navigated back. Otherwise a low-quality JPEG screenshot of the page is
 * stored at `archives/preview/<collectionId>/<linkId>.jpeg` and that path is
 * written to the link's `preview` column.
 *
 * @param link - Link whose preview is generated. Nothing is done when its
 *   `preview` already starts with "archive".
 * @param page - Playwright page used to read the meta tag, download the image
 *   and take the fallback screenshot.
 */
const getArchivePreview = async (
  link: LinksAndCollectionAndOwner,
  page: Page
) => {
  // A preview already exists: do not navigate the shared page away for nothing.
  if (link.preview?.startsWith("archive")) return;

  const ogImageUrl = await page.evaluate(() => {
    const metaTag = document.querySelector<HTMLMetaElement>(
      'meta[property="og:image"]'
    );
    return metaTag ? metaTag.content : null;
  });

  if (ogImageUrl) {
    console.log("Found og:image URL:", ogImageUrl);

    // Download the image by navigating the page to it.
    const imageResponse = await page.goto(ogImageUrl);

    try {
      // `goto` may resolve to null, in which case there is nothing to read.
      if (imageResponse) {
        const buffer = await imageResponse.body();
        // Awaited so that a failure surfaces here instead of as an unhandled
        // rejection (harmless if generatePreview turns out to be synchronous).
        await generatePreview(buffer, link.collectionId, link.id);
      }
    } finally {
      // Always restore the original document: later archive steps reuse `page`.
      await page.goBack();
    }
    return;
  }

  console.log("No og:image found");

  const previewPath = `archives/preview/${link.collectionId}/${link.id}.jpeg`;
  const screenshot = await page.screenshot({ type: "jpeg", quality: 20 });

  await createFile({ data: screenshot, filePath: previewPath });
  await prisma.link.update({
    where: { id: link.id },
    data: { preview: previewPath },
  });
};
|
||||
|
||||
/**
 * Captures a full-page PNG screenshot and/or a PDF of the page and records
 * the resulting file paths on the link.
 *
 * The page is auto-scrolled first (bounded by `AUTOSCROLL_TIMEOUT` seconds,
 * default 30). Both captures run concurrently. A capture is skipped when the
 * user has it disabled or when the link's column already starts with
 * "archive". A path is only written to the database when its capture did not
 * fail, so a failed screenshot/PDF no longer leaves a path to a missing file.
 *
 * @param link - Link being archived.
 * @param page - Playwright page to capture.
 * @param user - Supplies the `archiveAsScreenshot` / `archiveAsPDF` switches.
 */
const captureScreenshotAndPdf = async (
  link: LinksAndCollectionAndOwner,
  page: Page,
  user: User
) => {
  await page.evaluate(autoScroll, Number(process.env.AUTOSCROLL_TIMEOUT) || 30);

  // Check if the user hasn't deleted the link by the time we're done scrolling
  const linkExists = await prisma.link.findUnique({
    where: { id: link.id },
  });
  if (!linkExists) return;

  const imagePath = `archives/${linkExists.collectionId}/${link.id}.png`;
  const pdfPath = `archives/${linkExists.collectionId}/${link.id}.pdf`;

  const saveScreenshot = async () => {
    const screenshot = await page.screenshot({ fullPage: true, type: "png" });
    await createFile({ data: screenshot, filePath: imagePath });
  };

  const savePdf = async () => {
    const pdf = await page.pdf({
      width: "1366px",
      height: "1931px",
      printBackground: true,
      margin: {
        top: process.env.PDF_MARGIN_TOP || "15px",
        bottom: process.env.PDF_MARGIN_BOTTOM || "15px",
      },
    });
    await createFile({ data: pdf, filePath: pdfPath });
  };

  // A skipped capture is represented by `undefined`, which settles as fulfilled.
  const [screenshotResult, pdfResult] = await Promise.allSettled([
    user.archiveAsScreenshot && !link.image?.startsWith("archive")
      ? saveScreenshot()
      : undefined,
    user.archiveAsPDF && !link.pdf?.startsWith("archive")
      ? savePdf()
      : undefined,
  ]);

  if (screenshotResult.status === "rejected")
    console.error("Error capturing screenshot:", screenshotResult.reason);
  if (pdfResult.status === "rejected")
    console.error("Error capturing PDF:", pdfResult.reason);

  await prisma.link.update({
    where: { id: link.id },
    data: {
      // `undefined` leaves the column untouched.
      image:
        user.archiveAsScreenshot && screenshotResult.status === "fulfilled"
          ? imagePath
          : undefined,
      pdf:
        user.archiveAsPDF && pdfResult.status === "fulfilled"
          ? pdfPath
          : undefined,
    },
  });
};
|
||||
|
||||
/**
 * Scrolls the document down in 100px steps every 100ms until the accumulated
 * distance reaches `document.body.scrollHeight` or the timeout elapses,
 * whichever comes first, then scrolls back to the top.
 *
 * This function is passed to `page.evaluate`, so it must stay self-contained:
 * it may only use its argument and the `window` / `document` globals.
 *
 * Both exits stop the interval and the timer; previously a timeout left the
 * interval running, so the page kept scrolling while it was being captured.
 *
 * @param AUTOSCROLL_TIMEOUT - Maximum time to spend scrolling, in seconds.
 */
const autoScroll = async (AUTOSCROLL_TIMEOUT: number) => {
  await new Promise<void>((resolve) => {
    const distance = 100;
    let totalHeight = 0;

    // Single exit path shared by "reached the bottom" and "timed out".
    const finish = () => {
      clearInterval(scrollDown);
      clearTimeout(deadline);
      window.scroll(0, 0);
      resolve();
    };

    const scrollDown = setInterval(() => {
      // Re-read on every tick: the height can grow as content lazy-loads.
      const scrollHeight = document.body.scrollHeight;
      window.scrollBy(0, distance);
      totalHeight += distance;

      if (totalHeight >= scrollHeight) finish();
    }, 100);

    const deadline = setTimeout(finish, AUTOSCROLL_TIMEOUT * 1000);
  });
};
|
||||
|
||||
@@ -1,111 +0,0 @@
|
||||
import { execSync } from "child_process";
|
||||
import createFile from "../storage/createFile";
|
||||
import axios from "axios";
|
||||
import { Agent } from "http";
|
||||
import { prisma } from "../db";
|
||||
import { Link } from "@prisma/client";
|
||||
|
||||
/**
 * Writes archived HTML to `archives/<collectionId>/<linkId>.html` and records
 * that path in the link's `singlefile` column.
 *
 * The collection id is re-read from the database; when the link can no longer
 * be found, or the HTML is empty, an error is logged and nothing is stored.
 *
 * @param link - Link the HTML belongs to.
 * @param html - Archived document.
 * @param errorPrefix - Prefix identifying the archive source in error logs.
 */
const saveSinglefileHtml = async (
  link: Link,
  html: Buffer | string,
  errorPrefix: string
): Promise<void> => {
  if (!html?.length) {
    console.error(`${errorPrefix} Empty buffer`);
    return;
  }

  const current = await prisma.link.findUnique({
    where: { id: link.id },
    select: { collectionId: true },
  });
  const collectionId = current?.collectionId;

  if (!collectionId) {
    console.error(`${errorPrefix} Collection ID not found`);
    return;
  }

  const filePath = `archives/${collectionId}/${link.id}.html`;

  await createFile({ data: html, filePath });
  await prisma.link.update({
    where: { id: link.id },
    data: { singlefile: filePath },
  });
};

/**
 * Archives a link as a single HTML file.
 *
 * Uses `SINGLEFILE_ARCHIVE_COMMAND` (a shell command template containing
 * `{{URL}}`) when set, otherwise `SINGLEFILE_ARCHIVE_HTTP_API` (an endpoint
 * that receives the URL via POST). Does nothing when the link has no URL or
 * neither variable is set. Failures are logged, never thrown.
 *
 * @param link - Link to archive.
 */
const archiveAsSinglefile = async (link: Link) => {
  if (!link.url) return;

  const command = process.env.SINGLEFILE_ARCHIVE_COMMAND;
  const httpApi = process.env.SINGLEFILE_ARCHIVE_HTTP_API;

  if (command) {
    const errorPrefix = "Error running SINGLEFILE_ARCHIVE_COMMAND:";

    if (!command.includes("{{URL}}")) {
      console.error("Invalid SINGLEFILE_ARCHIVE_COMMAND. Missing {{URL}}");
      return;
    }

    try {
      // split/join substitutes every placeholder literally; String.replace
      // would expand `$&`, `$'` etc. found inside the URL.
      //
      // NOTE(security): the URL is user-supplied and ends up in a shell
      // command line. How the template quotes {{URL}} is not visible here, so
      // it cannot be escaped safely in this function — TODO confirm the
      // template quoting or switch to an argument-vector invocation.
      const html = execSync(command.split("{{URL}}").join(link.url), {
        timeout: 120000,
        maxBuffer: 1024 * 1024 * 30,
      });

      await saveSinglefileHtml(link, html, errorPrefix);
    } catch (err) {
      console.error(errorPrefix, err);
    }
  } else if (httpApi) {
    const errorPrefix =
      "Error fetching Singlefile using SINGLEFILE_ARCHIVE_HTTP_API:";

    try {
      const response = await axios.post(
        httpApi,
        { url: link.url },
        {
          headers: {
            "Content-Type": "application/x-www-form-urlencoded",
          },
          httpAgent: new Agent({ keepAlive: false }),
        }
      );

      await saveSinglefileHtml(link, response.data, errorPrefix);
    } catch (err) {
      console.error(errorPrefix, err);
    }
  }
};
|
||||
|
||||
export default archiveAsSinglefile;
|
||||
@@ -0,0 +1,56 @@
|
||||
import { Collection, Link, User } from "@prisma/client";
|
||||
import { Page } from "playwright";
|
||||
import generatePreview from "../generatePreview";
|
||||
import createFile from "../storage/createFile";
|
||||
import { prisma } from "../db";
|
||||
|
||||
/** A link together with its collection and that collection's owner. */
type LinksAndCollectionAndOwner = Link & {
  collection: Collection & { owner: User };
};
|
||||
|
||||
/**
 * Generates the preview image for a link from the page that is currently
 * loaded in `page` (presumably the link's URL, navigated by the caller).
 *
 * When the page declares an `og:image` meta tag, the page is navigated to that
 * image, the response body is handed to `generatePreview`, and the page is
 * navigated back. Otherwise a low-quality JPEG screenshot of the page is
 * stored at `archives/preview/<collectionId>/<linkId>.jpeg` and that path is
 * written to the link's `preview` column.
 *
 * @param link - Link whose preview is generated. Nothing is done when its
 *   `preview` already starts with "archive".
 * @param page - Playwright page used to read the meta tag, download the image
 *   and take the fallback screenshot.
 */
const handleArchivePreview = async (
  link: LinksAndCollectionAndOwner,
  page: Page
) => {
  // A preview already exists: do not navigate the shared page away for nothing.
  if (link.preview?.startsWith("archive")) return;

  const ogImageUrl = await page.evaluate(() => {
    const metaTag = document.querySelector<HTMLMetaElement>(
      'meta[property="og:image"]'
    );
    return metaTag ? metaTag.content : null;
  });

  if (ogImageUrl) {
    console.log("Found og:image URL:", ogImageUrl);

    // Download the image by navigating the page to it.
    const imageResponse = await page.goto(ogImageUrl);

    try {
      // `goto` may resolve to null, in which case there is nothing to read.
      if (imageResponse) {
        const buffer = await imageResponse.body();
        // Awaited so that a failure surfaces here instead of as an unhandled
        // rejection (harmless if generatePreview turns out to be synchronous).
        await generatePreview(buffer, link.collectionId, link.id);
      }
    } finally {
      // Always restore the original document: later archive steps reuse `page`.
      await page.goBack();
    }
    return;
  }

  console.log("No og:image found");

  const previewPath = `archives/preview/${link.collectionId}/${link.id}.jpeg`;
  const screenshot = await page.screenshot({ type: "jpeg", quality: 20 });

  await createFile({ data: screenshot, filePath: previewPath });
  await prisma.link.update({
    where: { id: link.id },
    data: { preview: previewPath },
  });
};
|
||||
|
||||
export default handleArchivePreview;
|
||||
@@ -0,0 +1,97 @@
|
||||
import { execFileSync, execSync } from "child_process";
import createFile from "../storage/createFile";
import axios from "axios";
import { Agent } from "http";
import { prisma } from "../db";
import { Link } from "@prisma/client";
import { Page } from "playwright";
|
||||
|
||||
/**
 * Archives a page as a single self-contained HTML file using the `monolith`
 * CLI and records the result on the link.
 *
 * The already-rendered page HTML is piped to monolith on stdin, with the
 * link's URL as the base URL for resolving relative assets. The output is
 * written to `archives/<collectionId>/<linkId>.html` and that path is stored
 * in the link's `singlefile` column. Failures are logged, never thrown.
 *
 * Environment:
 * - `MONOLITH_OPTIONS`: extra CLI flags, split on whitespace (default
 *   "-j -F -s"). Shell quoting inside this value is not interpreted.
 * - `MONOLITH_MAX_BUFFER`: maximum output size in MiB (default 5).
 *
 * @param link - Link being archived; nothing is done when it has no URL.
 * @param content - HTML of the page, fed to monolith on stdin.
 */
const handleMonolith = async (link: Link, content: string) => {
  if (!link.url) return;

  const extraOptions = (process.env.MONOLITH_OPTIONS || "-j -F -s")
    .split(/\s+/)
    .filter(Boolean);

  // Falls back to the default when the variable is unset, empty or not numeric.
  const maxBufferMiB = Number(process.env.MONOLITH_MAX_BUFFER) || 5;

  try {
    // execFileSync passes an argument vector and spawns no shell, so the
    // user-supplied URL cannot inject shell syntax.
    const html = execFileSync(
      "monolith",
      ["-", "-I", "-b", link.url, ...extraOptions, "-o", "-"],
      {
        timeout: 120000,
        maxBuffer: 1024 * 1024 * maxBufferMiB,
        input: content,
      }
    );

    if (!html?.length) {
      console.error("Error running monolith: Empty buffer");
      return;
    }

    const filePath = `archives/${link.collectionId}/${link.id}.html`;

    await createFile({ data: html, filePath });
    await prisma.link.update({
      where: { id: link.id },
      data: { singlefile: filePath },
    });
  } catch (err) {
    console.error("Error running monolith:", err);
  }
};
|
||||
|
||||
export default handleMonolith;
|
||||
+2
-2
@@ -5,7 +5,7 @@ import { prisma } from "../db";
|
||||
import createFile from "../storage/createFile";
|
||||
import { Link } from "@prisma/client";
|
||||
|
||||
const archiveAsReadablility = async (content: string, link: Link) => {
|
||||
const handleReadablility = async (content: string, link: Link) => {
|
||||
const window = new JSDOM("").window;
|
||||
const purify = DOMPurify(window);
|
||||
const cleanedUpContent = purify.sanitize(content);
|
||||
@@ -38,4 +38,4 @@ const archiveAsReadablility = async (content: string, link: Link) => {
|
||||
}
|
||||
};
|
||||
|
||||
export default archiveAsReadablility;
|
||||
export default handleReadablility;
|
||||
@@ -0,0 +1,98 @@
|
||||
import { Collection, Link, User } from "@prisma/client";
|
||||
import { Page } from "playwright";
|
||||
import createFile from "../storage/createFile";
|
||||
import { prisma } from "../db";
|
||||
|
||||
/** A link together with its collection and that collection's owner. */
type LinksAndCollectionAndOwner = Link & {
  collection: Collection & { owner: User };
};
|
||||
/**
 * Captures a full-page PNG screenshot and/or a PDF of the page and records
 * the resulting file paths on the link.
 *
 * The page is auto-scrolled first (bounded by `AUTOSCROLL_TIMEOUT` seconds,
 * default 30). Both captures run concurrently. A capture is skipped when the
 * user has it disabled or when the link's column already starts with
 * "archive". A path is only written to the database when its capture did not
 * fail, so a failed screenshot/PDF no longer leaves a path to a missing file.
 *
 * @param link - Link being archived.
 * @param page - Playwright page to capture.
 * @param user - Supplies the `archiveAsScreenshot` / `archiveAsPDF` switches.
 */
const handleScreenshotAndPdf = async (
  link: LinksAndCollectionAndOwner,
  page: Page,
  user: User
) => {
  await page.evaluate(autoScroll, Number(process.env.AUTOSCROLL_TIMEOUT) || 30);

  // Check if the user hasn't deleted the link by the time we're done scrolling
  const linkExists = await prisma.link.findUnique({
    where: { id: link.id },
  });
  if (!linkExists) return;

  const imagePath = `archives/${linkExists.collectionId}/${link.id}.png`;
  const pdfPath = `archives/${linkExists.collectionId}/${link.id}.pdf`;

  const saveScreenshot = async () => {
    const screenshot = await page.screenshot({ fullPage: true, type: "png" });
    await createFile({ data: screenshot, filePath: imagePath });
  };

  const savePdf = async () => {
    const pdf = await page.pdf({
      width: "1366px",
      height: "1931px",
      printBackground: true,
      margin: {
        top: process.env.PDF_MARGIN_TOP || "15px",
        bottom: process.env.PDF_MARGIN_BOTTOM || "15px",
      },
    });
    await createFile({ data: pdf, filePath: pdfPath });
  };

  // A skipped capture is represented by `undefined`, which settles as fulfilled.
  const [screenshotResult, pdfResult] = await Promise.allSettled([
    user.archiveAsScreenshot && !link.image?.startsWith("archive")
      ? saveScreenshot()
      : undefined,
    user.archiveAsPDF && !link.pdf?.startsWith("archive")
      ? savePdf()
      : undefined,
  ]);

  if (screenshotResult.status === "rejected")
    console.error("Error capturing screenshot:", screenshotResult.reason);
  if (pdfResult.status === "rejected")
    console.error("Error capturing PDF:", pdfResult.reason);

  await prisma.link.update({
    where: { id: link.id },
    data: {
      // `undefined` leaves the column untouched.
      image:
        user.archiveAsScreenshot && screenshotResult.status === "fulfilled"
          ? imagePath
          : undefined,
      pdf:
        user.archiveAsPDF && pdfResult.status === "fulfilled"
          ? pdfPath
          : undefined,
    },
  });
};
|
||||
|
||||
/**
 * Scrolls the document down in 100px steps every 100ms until the accumulated
 * distance reaches `document.body.scrollHeight` or the timeout elapses,
 * whichever comes first, then scrolls back to the top.
 *
 * This function is passed to `page.evaluate`, so it must stay self-contained:
 * it may only use its argument and the `window` / `document` globals.
 *
 * Both exits stop the interval and the timer; previously a timeout left the
 * interval running, so the page kept scrolling while it was being captured.
 *
 * @param AUTOSCROLL_TIMEOUT - Maximum time to spend scrolling, in seconds.
 */
const autoScroll = async (AUTOSCROLL_TIMEOUT: number) => {
  await new Promise<void>((resolve) => {
    const distance = 100;
    let totalHeight = 0;

    // Single exit path shared by "reached the bottom" and "timed out".
    const finish = () => {
      clearInterval(scrollDown);
      clearTimeout(deadline);
      window.scroll(0, 0);
      resolve();
    };

    const scrollDown = setInterval(() => {
      // Re-read on every tick: the height can grow as content lazy-loads.
      const scrollHeight = document.body.scrollHeight;
      window.scrollBy(0, distance);
      totalHeight += distance;

      if (totalHeight >= scrollHeight) finish();
    }, 100);

    const deadline = setTimeout(finish, AUTOSCROLL_TIMEOUT * 1000);
  });
};
|
||||
|
||||
export default handleScreenshotAndPdf;
|
||||
@@ -33,7 +33,7 @@ export default async function validateUrlSize(url: string) {
|
||||
|
||||
const totalSizeMB =
|
||||
Number(response.headers.get("content-length")) / Math.pow(1024, 2);
|
||||
if (totalSizeMB > (Number(process.env.NEXT_PUBLIC_MAX_FILE_SIZE) || 30))
|
||||
if (totalSizeMB > Number(process.env.NEXT_PUBLIC_MAX_FILE_BUFFER || 10))
|
||||
return null;
|
||||
else return response.headers;
|
||||
} catch (err) {
|
||||
|
||||
Reference in New Issue
Block a user