01c6cafd23
Implementa ToMarkdown() para convertir HTML a Markdown usando Turndown.js inline. Incluye: - Soporte para títulos, enlaces, imágenes, listas, tablas - Opciones para incluir/excluir imágenes y enlaces - Selector CSS opcional para convertir secciones específicas - Comando CLI to_markdown.go para uso directo Archivo: pkg/browser/markdown.go, cmd/to_markdown.go
239 lines
6.1 KiB
Go
239 lines
6.1 KiB
Go
package browser
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
)
|
|
|
|
// MarkdownOptions controls how ToMarkdown converts a page to Markdown.
//
// Note that the Go zero value of this struct has IncludeImages and
// IncludeLinks set to false; use DefaultMarkdownOptions to obtain the
// documented defaults (both true).
type MarkdownOptions struct {
	// Selector is an optional CSS selector; when non-empty only the matching
	// part of the page is converted.
	Selector string

	// IncludeImages reports whether images are kept in the output.
	IncludeImages bool

	// IncludeLinks reports whether links are kept in the output
	// (true in DefaultMarkdownOptions).
	IncludeLinks bool
}
|
|
|
|
// DefaultMarkdownOptions retorna opciones por defecto
|
|
func DefaultMarkdownOptions() *MarkdownOptions {
|
|
return &MarkdownOptions{
|
|
Selector: "",
|
|
IncludeImages: true,
|
|
IncludeLinks: true,
|
|
}
|
|
}
|
|
|
|
// ToMarkdown convierte el contenido HTML de la página actual a Markdown
|
|
// usando la biblioteca Turndown.js ejecutada en el navegador
|
|
func (b *Browser) ToMarkdown(ctx context.Context, opts *MarkdownOptions) (string, error) {
|
|
if opts == nil {
|
|
opts = DefaultMarkdownOptions()
|
|
}
|
|
|
|
// Script que incluye Turndown.js y realiza la conversión
|
|
script := fmt.Sprintf(`
|
|
(function() {
|
|
// Librería Turndown inline (versión minificada)
|
|
// https://github.com/mixmark-io/turndown
|
|
const TurndownService = %s;
|
|
|
|
// Configurar Turndown
|
|
const turndownService = new TurndownService({
|
|
headingStyle: 'atx',
|
|
hr: '---',
|
|
bulletListMarker: '-',
|
|
codeBlockStyle: 'fenced',
|
|
fence: '` + "```" + `',
|
|
emDelimiter: '_',
|
|
strongDelimiter: '**',
|
|
linkStyle: 'inlined',
|
|
linkReferenceStyle: 'full'
|
|
});
|
|
|
|
// Configurar reglas personalizadas
|
|
if (!%t) {
|
|
// Eliminar imágenes si no se incluyen
|
|
turndownService.addRule('removeImages', {
|
|
filter: 'img',
|
|
replacement: function() { return ''; }
|
|
});
|
|
}
|
|
|
|
if (!%t) {
|
|
// Convertir enlaces a texto plano si no se incluyen
|
|
turndownService.addRule('removeLinks', {
|
|
filter: 'a',
|
|
replacement: function(content) { return content; }
|
|
});
|
|
}
|
|
|
|
// Obtener HTML a convertir
|
|
let element;
|
|
if ('%s') {
|
|
element = document.querySelector('%s');
|
|
if (!element) {
|
|
throw new Error('Selector not found: %s');
|
|
}
|
|
} else {
|
|
element = document.body;
|
|
}
|
|
|
|
// Convertir a Markdown
|
|
const markdown = turndownService.turndown(element);
|
|
return markdown;
|
|
})();
|
|
`, getTurndownLibrary(), opts.IncludeImages, opts.IncludeLinks,
|
|
opts.Selector, opts.Selector, opts.Selector)
|
|
|
|
result, err := b.Evaluate(ctx, script)
|
|
if err != nil {
|
|
return "", fmt.Errorf("error converting to markdown: %w", err)
|
|
}
|
|
|
|
if result.Value == nil {
|
|
return "", fmt.Errorf("markdown conversion returned null")
|
|
}
|
|
|
|
// Convertir resultado a string
|
|
var markdown string
|
|
if str, ok := result.Value.(string); ok {
|
|
markdown = str
|
|
} else {
|
|
// Intentar serializar como JSON
|
|
jsonBytes, err := json.Marshal(result.Value)
|
|
if err != nil {
|
|
return "", fmt.Errorf("error parsing markdown result: %w", err)
|
|
}
|
|
markdown = string(jsonBytes)
|
|
}
|
|
|
|
return markdown, nil
|
|
}
|
|
|
|
// getTurndownLibrary returns JavaScript source for an expression that
// evaluates to a TurndownService constructor.
//
// This is a small stand-in for Turndown, not the real library: it walks the
// DOM and handles headings, paragraphs, line breaks, emphasis, links, images,
// lists, inline and fenced code, blockquotes, horizontal rules and basic
// tables; script/style/noscript content is dropped and unknown elements are
// replaced by their converted children. The constructor options are stored
// but not interpreted.
//
// Rules registered through addRule(key, {filter, replacement}) take precedence
// over the built-in conversions. filter may be a tag name, an array of tag
// names, or a function(node, options); replacement is called as
// replacement(content, node, options). When several rules match, the most
// recently added one wins.
//
// For production use, consider bundling the complete Turndown library.
func getTurndownLibrary() string {
	return `
(function() {
	// Elements that start their own block; whitespace next to them is not
	// significant, so it is not preserved around text nodes.
	const BLOCK_TAGS = new Set([
		'address', 'article', 'aside', 'blockquote', 'br', 'dd', 'details', 'div',
		'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
		'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr', 'li', 'main', 'nav',
		'noscript', 'ol', 'p', 'pre', 'script', 'section', 'style', 'table',
		'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'ul'
	]);

	// True when node is an element that is rendered inline.
	function isInlineElement(node) {
		return !!node &&
			node.nodeType === Node.ELEMENT_NODE &&
			!BLOCK_TAGS.has(node.tagName.toLowerCase());
	}

	// Lower-cased tag name of the parent element, or '' for a detached node.
	function parentTag(node) {
		return node.parentElement ? node.parentElement.tagName.toLowerCase() : '';
	}

	// True when a rule filter (tag name, list of tag names or predicate)
	// accepts the node.
	function matchesFilter(filter, node, tagName, options) {
		if (typeof filter === 'string') {
			return filter.toLowerCase() === tagName;
		}
		if (Array.isArray(filter)) {
			return filter.some(name => String(name).toLowerCase() === tagName);
		}
		if (typeof filter === 'function') {
			return Boolean(filter(node, options));
		}
		return false;
	}

	function TurndownService(options) {
		this.options = options || {};
		this.rules = {
			array: []
		};
		// keep/remove are accepted for API compatibility but do nothing.
		this.keep = function(filter) {};
		this.remove = function(filter) {};
	}

	TurndownService.prototype.addRule = function(key, rule) {
		this.rules.array.push(rule);
		return this;
	};

	// Returns the most recently added custom rule that matches the node,
	// or null when none applies.
	TurndownService.prototype.findRule = function(node, tagName) {
		const rules = this.rules.array;
		for (let i = rules.length - 1; i >= 0; i--) {
			const rule = rules[i];
			if (rule && typeof rule.replacement === 'function' &&
				matchesFilter(rule.filter, node, tagName, this.options)) {
				return rule;
			}
		}
		return null;
	};

	TurndownService.prototype.turndown = function(input) {
		if (typeof input === 'string') {
			const div = document.createElement('div');
			div.innerHTML = input;
			input = div;
		}

		return this.processNode(input);
	};

	// Converts a text node. Surrounding whitespace is trimmed, except that a
	// single space is kept on a side that touches an inline element, so that
	// "Hello <b>world</b>" does not become "Hello**world**".
	TurndownService.prototype.processText = function(node) {
		const raw = node.textContent;
		const text = raw.trim();
		const spaceBefore = /^\s/.test(raw) && isInlineElement(node.previousSibling);
		const spaceAfter = /\s$/.test(raw) && isInlineElement(node.nextSibling);

		if (text === '') {
			// Whitespace-only node: only meaningful between two inline elements.
			return (spaceBefore && spaceAfter) ? ' ' : '';
		}
		return (spaceBefore ? ' ' : '') + text + (spaceAfter ? ' ' : '');
	};

	TurndownService.prototype.processNode = function(node) {
		if (node.nodeType === Node.TEXT_NODE) {
			return this.processText(node);
		}

		if (node.nodeType !== Node.ELEMENT_NODE) {
			return '';
		}

		const tagName = node.tagName.toLowerCase();
		const children = Array.from(node.childNodes).map(child => this.processNode(child)).join('');

		// Custom rules override the built-in conversions below.
		const rule = this.findRule(node, tagName);
		if (rule) {
			return rule.replacement(children, node, this.options);
		}

		switch (tagName) {
			case 'h1':
				return '# ' + children + '\n\n';
			case 'h2':
				return '## ' + children + '\n\n';
			case 'h3':
				return '### ' + children + '\n\n';
			case 'h4':
				return '#### ' + children + '\n\n';
			case 'h5':
				return '##### ' + children + '\n\n';
			case 'h6':
				return '###### ' + children + '\n\n';
			case 'p':
				return children + '\n\n';
			case 'br':
				return '  \n';
			case 'strong':
			case 'b':
				return '**' + children + '**';
			case 'em':
			case 'i':
				return '_' + children + '_';
			case 'a': {
				const href = node.getAttribute('href') || '';
				// Anchors without a target are not links; keep only their text.
				if (href === '') {
					return children;
				}
				return '[' + children + '](' + href + ')';
			}
			case 'img': {
				const src = node.getAttribute('src') || '';
				const alt = node.getAttribute('alt') || '';
				return '![' + alt + '](' + src + ')';
			}
			case 'ul':
			case 'ol':
				return '\n' + children + '\n';
			case 'li': {
				const listMarker = parentTag(node) === 'ol' ? '1. ' : '- ';
				return listMarker + children + '\n';
			}
			case 'code':
				// Inside <pre> the fence is emitted by the 'pre' case.
				if (parentTag(node) === 'pre') {
					return children;
				}
				return '` + "`" + `' + children + '` + "`" + `';
			case 'pre':
				return '\n` + "```" + `\n' + children + '\n` + "```" + `\n\n';
			case 'blockquote':
				// Trim first so trailing blank lines do not become empty "> " lines.
				return '\n> ' + children.trim().split('\n').join('\n> ') + '\n\n';
			case 'hr':
				return '\n---\n\n';
			case 'table':
				return '\n' + this.processTable(node) + '\n';
			case 'script':
			case 'style':
			case 'noscript':
				return '';
			default:
				return children;
		}
	};

	// Basic table conversion: one Markdown row per <tr> that belongs to this
	// table, with a separator row after the first row when it is a header
	// (its first cell is a <th>).
	TurndownService.prototype.processTable = function(table) {
		let markdown = '';
		let emittedRows = 0;

		table.querySelectorAll('tr').forEach(row => {
			// Skip rows that belong to a nested table.
			if (row.closest('table') !== table) {
				return;
			}

			const cells = Array.from(row.querySelectorAll('th, td'))
				.filter(cell => cell.closest('tr') === row);
			// A row without cells has nothing to render.
			if (cells.length === 0) {
				return;
			}

			// Newlines and unescaped pipes inside a cell would break the row.
			const cellContents = cells.map(cell =>
				cell.textContent.trim().replace(/\s+/g, ' ').replace(/\|/g, '\\|'));
			markdown += '| ' + cellContents.join(' | ') + ' |\n';

			if (emittedRows === 0 && cells[0].tagName.toLowerCase() === 'th') {
				markdown += '| ' + cellContents.map(() => '---').join(' | ') + ' |\n';
			}
			emittedRows++;
		});

		return markdown;
	};

	return TurndownService;
})()
`
}
|