diff --git a/cmd/to_markdown.go b/cmd/to_markdown.go new file mode 100644 index 0000000..3a04130 --- /dev/null +++ b/cmd/to_markdown.go @@ -0,0 +1,72 @@ +package main + +import ( + "context" + "flag" + "fmt" + "log" + "os" + + "navegator/pkg/browser" +) + +func main() { + urlFlag := flag.String("url", "", "URL to convert to markdown") + selectorFlag := flag.String("selector", "", "CSS selector to convert (optional)") + outputFlag := flag.String("output", "", "Output file (default: stdout)") + noImages := flag.Bool("no-images", false, "Exclude images") + noLinks := flag.Bool("no-links", false, "Convert links to plain text") + flag.Parse() + + if *urlFlag == "" { + log.Fatal("Usage: to_markdown -url [-selector ] [-output ] [-no-images] [-no-links]") + } + + ctx := context.Background() + + // Configure the browser: dedicated profile name, headless mode + config := browser.DefaultConfig() + config.ProfileName = "markdown-converter" + config.StealthFlags.Headless = true + + // Launch the browser + log.Println("Launching browser...") + b, err := browser.Launch(ctx, config) + if err != nil { + log.Fatalf("Error launching browser: %v", err) + } + defer b.Close() + + // Navigate to the URL; a navigation error is only logged as a warning and conversion still proceeds + log.Printf("Navigating to %s...\n", *urlFlag) + opts := browser.DefaultNavigateOptions() + opts.WaitUntil = "networkidle" + + if err := b.Navigate(ctx, *urlFlag, opts); err != nil { + log.Printf("Warning: navigation error: %v\n", err) + } + + // Configure markdown options from the command-line flags + mdOpts := browser.DefaultMarkdownOptions() + mdOpts.Selector = *selectorFlag + mdOpts.IncludeImages = !*noImages + mdOpts.IncludeLinks = !*noLinks + + // Convert to markdown. NOTE(review): the log.Fatalf calls below exit the process, so the deferred b.Close() above does not run on those error paths. + log.Println("Converting to markdown...") + markdown, err := b.ToMarkdown(ctx, mdOpts) + if err != nil { + log.Fatalf("Error converting to markdown: %v", err) + } + + // Output: write to the -output file when given, otherwise print a banner and the markdown to stdout + if *outputFlag != "" { + if err := os.WriteFile(*outputFlag, []byte(markdown), 0644); err != nil { + log.Fatalf("Error writing to file: %v", err) + } + log.Printf("Markdown saved to %s\n", *outputFlag) + } else 
{ + fmt.Println("\n=== MARKDOWN OUTPUT ===\n") + fmt.Println(markdown) + } +} diff --git a/pkg/browser/markdown.go b/pkg/browser/markdown.go new file mode 100644 index 0000000..31c1a58 --- /dev/null +++ b/pkg/browser/markdown.go @@ -0,0 +1,238 @@ +package browser + +import ( + "context" + "encoding/json" + "fmt" +) + +// MarkdownOptions holds the options for conversion to Markdown +type MarkdownOptions struct { + Selector string // Optional CSS selector to convert only part of the page + IncludeImages bool // Include images in the output + IncludeLinks bool // Include links (default: true) +} + +// DefaultMarkdownOptions returns the default options: no selector, images and links included +func DefaultMarkdownOptions() *MarkdownOptions { + return &MarkdownOptions{ + Selector: "", + IncludeImages: true, + IncludeLinks: true, + } +} + +// ToMarkdown converts the HTML content of the current page to Markdown + // using the inline Turndown-style library executed in the browser via b.Evaluate. A nil opts falls back to DefaultMarkdownOptions. NOTE(review): opts.Selector is interpolated unescaped into single-quoted JavaScript strings, so a selector containing a single quote breaks the generated script. +func (b *Browser) ToMarkdown(ctx context.Context, opts *MarkdownOptions) (string, error) { + if opts == nil { + opts = DefaultMarkdownOptions() + } + + // Script that embeds the library returned by getTurndownLibrary and performs the conversion + script := fmt.Sprintf(` + (function() { + // Librería Turndown inline (versión minificada) + // https://github.com/mixmark-io/turndown + const TurndownService = %s; + + // Configurar Turndown + const turndownService = new TurndownService({ + headingStyle: 'atx', + hr: '---', + bulletListMarker: '-', + codeBlockStyle: 'fenced', + fence: '` + "```" + `', + emDelimiter: '_', + strongDelimiter: '**', + linkStyle: 'inlined', + linkReferenceStyle: 'full' + }); + + // Configurar reglas personalizadas + if (!%t) { + // Eliminar imágenes si no se incluyen + turndownService.addRule('removeImages', { + filter: 'img', + replacement: function() { return ''; } + }); + } + + if (!%t) { + // Convertir enlaces a texto plano si no se incluyen + turndownService.addRule('removeLinks', { + filter: 'a', + replacement: function(content) { return content; } + }); + 
} + + // Obtener HTML a convertir + let element; + if ('%s') { + element = document.querySelector('%s'); + if (!element) { + throw new Error('Selector not found: %s'); + } + } else { + element = document.body; + } + + // Convertir a Markdown + const markdown = turndownService.turndown(element); + return markdown; + })(); + `, getTurndownLibrary(), opts.IncludeImages, opts.IncludeLinks, + opts.Selector, opts.Selector, opts.Selector) + + result, err := b.Evaluate(ctx, script) + if err != nil { + return "", fmt.Errorf("error converting to markdown: %w", err) + } + + if result.Value == nil { + return "", fmt.Errorf("markdown conversion returned null") + } + + // Convert the result to a string + var markdown string + if str, ok := result.Value.(string); ok { + markdown = str + } else { + // Not a string: fall back to serializing the value as JSON + jsonBytes, err := json.Marshal(result.Value) + if err != nil { + return "", fmt.Errorf("error parsing markdown result: %w", err) + } + markdown = string(jsonBytes) + } + + return markdown, nil +} + +// getTurndownLibrary returns the inline Turndown.js code +// This is a simplified version. In production, load the full file. 
// getTurndownLibrary returns the JavaScript source of a small, Turndown-style
// HTML-to-Markdown converter.
//
// The returned text is a single JavaScript expression (an immediately invoked
// function) that evaluates to the TurndownService constructor, so it can be
// spliced directly into a larger script as `const TurndownService = <expr>;`.
// It is a simplified stand-in for the real Turndown library and only handles
// a subset of HTML tags.
//
// Behaviour of the embedded converter:
//   - Rules registered with addRule are consulted for every element, most
//     recently added first, and take precedence over the built-in handling.
//     A rule's filter may be a tag name, a list of tag names, or a predicate
//     function; its replacement is called as replacement(content, node, options).
//   - Whitespace in text nodes is collapsed to single spaces instead of being
//     trimmed, so words next to inline elements are not glued together.
//   - <pre> content is emitted verbatim from textContent so code keeps its
//     indentation.
//   - Table rows without cells are skipped.
func getTurndownLibrary() string {
	return `
(function() {
  // Tags that are rendered as separate blocks. Whitespace-only text next to
  // one of these is layout noise and is dropped.
  var BLOCK_TAGS = [
    'address', 'article', 'aside', 'blockquote', 'body', 'dd', 'div', 'dl',
    'dt', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'header', 'hr', 'li', 'main', 'nav', 'ol', 'p', 'pre', 'section',
    'table', 'ul'
  ];

  // Lower-case tag name of an element, or '' for null and non-element nodes.
  function tagOf(node) {
    return node && node.nodeType === Node.ELEMENT_NODE ? node.tagName.toLowerCase() : '';
  }

  function isBlock(node) {
    return BLOCK_TAGS.indexOf(tagOf(node)) !== -1;
  }

  // A rule filter is a tag name, an array of tag names, or a predicate.
  function matchesFilter(filter, node, options) {
    if (typeof filter === 'string') {
      return filter.toLowerCase() === tagOf(node);
    }
    if (Array.isArray(filter)) {
      return filter.indexOf(tagOf(node)) !== -1;
    }
    if (typeof filter === 'function') {
      return Boolean(filter(node, options));
    }
    return false;
  }

  function TurndownService(options) {
    this.options = options || {};
    this.rules = {
      array: []
    };
    // Accepted for API compatibility; this simplified build ignores them.
    this.keep = function(filter) {};
    this.remove = function(filter) {};
  }

  TurndownService.prototype.addRule = function(key, rule) {
    this.rules.array.push(rule);
    return this;
  };

  // Returns the most recently added rule whose filter matches the element,
  // or null when no usable custom rule applies.
  TurndownService.prototype.findRule = function(node) {
    var rules = this.rules.array;
    for (var i = rules.length - 1; i >= 0; i--) {
      var rule = rules[i];
      if (rule && typeof rule.replacement === 'function' &&
          matchesFilter(rule.filter, node, this.options)) {
        return rule;
      }
    }
    return null;
  };

  TurndownService.prototype.turndown = function(input) {
    if (typeof input === 'string') {
      const div = document.createElement('div');
      div.innerHTML = input;
      input = div;
    }

    return this.processNode(input).trim();
  };

  TurndownService.prototype.processNode = function(node) {
    if (node.nodeType === Node.TEXT_NODE) {
      // Collapse runs of whitespace but keep one space so that neighbouring
      // inline content stays separated.
      const text = node.textContent.replace(/\s+/g, ' ');
      if (text === ' ') {
        const prev = node.previousSibling;
        const next = node.nextSibling;
        if (!prev || !next || isBlock(prev) || isBlock(next)) {
          return '';
        }
      }
      return text;
    }

    if (node.nodeType !== Node.ELEMENT_NODE) {
      return '';
    }

    const tagName = tagOf(node);

    // Preformatted text is taken verbatim; everything else is converted
    // recursively.
    const children = tagName === 'pre'
      ? node.textContent.replace(/\n$/, '')
      : Array.from(node.childNodes).map(child => this.processNode(child)).join('');

    // Custom rules win over the built-in tag handling below.
    const rule = this.findRule(node);
    if (rule) {
      return rule.replacement(children, node, this.options);
    }

    switch (tagName) {
      case 'h1':
      case 'h2':
      case 'h3':
      case 'h4':
      case 'h5':
      case 'h6':
        return '#'.repeat(Number(tagName.charAt(1))) + ' ' + children.trim() + '\n\n';
      case 'p':
        return children.trim() + '\n\n';
      case 'br':
        // Two trailing spaces are what makes a Markdown hard line break.
        return '  \n';
      case 'strong':
      case 'b':
        return children.trim() ? '**' + children + '**' : children;
      case 'em':
      case 'i':
        return children.trim() ? '_' + children + '_' : children;
      case 'a': {
        const href = node.getAttribute('href') || '';
        return '[' + children + '](' + href + ')';
      }
      case 'img': {
        const src = node.getAttribute('src') || '';
        const alt = node.getAttribute('alt') || '';
        return '![' + alt + '](' + src + ')';
      }
      case 'ul':
      case 'ol':
        return '\n' + children + '\n';
      case 'li': {
        const listMarker = tagOf(node.parentElement) === 'ol' ? '1. ' : '- ';
        return listMarker + children.trim() + '\n';
      }
      case 'code':
        if (tagOf(node.parentElement) === 'pre') {
          return children;
        }
        return '` + "`" + `' + children + '` + "`" + `';
      case 'pre':
        return '\n` + "```" + `\n' + children + '\n` + "```" + `\n\n';
      case 'blockquote':
        return '\n> ' + children.trim().split('\n').join('\n> ') + '\n\n';
      case 'hr':
        return '\n---\n\n';
      case 'table':
        return '\n' + this.processTable(node) + '\n';
      case 'script':
      case 'style':
      case 'noscript':
        return '';
      default:
        // Generic block containers end with a blank line so that adjacent
        // blocks are not concatenated.
        return isBlock(node) ? children + '\n\n' : children;
    }
  };

  TurndownService.prototype.processTable = function(table) {
    // Basic table handling: one Markdown row per table row that has cells.
    let markdown = '';
    let firstRow = true;

    table.querySelectorAll('tr').forEach((row) => {
      const cells = Array.from(row.querySelectorAll('th, td'));
      if (cells.length === 0) {
        return;
      }

      // Keep each cell on one line and escape the column separator.
      const cellContents = cells.map(cell =>
        cell.textContent.trim().replace(/\s+/g, ' ').replace(/\|/g, '\\|'));
      markdown += '| ' + cellContents.join(' | ') + ' |\n';

      // Add the separator after a header row.
      if (firstRow && tagOf(cells[0]) === 'th') {
        markdown += '| ' + cellContents.map(() => '---').join(' | ') + ' |\n';
      }
      firstRow = false;
    });

    return markdown;
  };

  return TurndownService;
})()
`
}