refactor: highlight code blocks function

This commit is contained in:
httpjamesm 2024-03-09 11:14:39 -05:00
parent 634c7f1ad0
commit 42ad68fe34
No known key found for this signature in database
2 changed files with 37 additions and 15 deletions

View File

@ -180,7 +180,7 @@ func extractQuestionData(doc *goquery.Document, domain string) (question types.F
if err != nil { if err != nil {
return question, err return question, err
} }
question.Body = template.HTML(utils.ReplaceImgTags(questionBodyParentHTML)) question.Body = template.HTML(processHTMLBody(questionBodyParentHTML))
// Extract the shortened body description. // Extract the shortened body description.
shortenedBody := strings.TrimSpace(questionBodyParent.Text()) shortenedBody := strings.TrimSpace(questionBodyParent.Text())
@ -212,8 +212,6 @@ func extractMetadata(selection *goquery.Selection, question *types.FilteredQuest
questionAuthorURL += questionAuthor.AttrOr("href", "") questionAuthorURL += questionAuthor.AttrOr("href", "")
question.AuthorURL = questionAuthorURL question.AuthorURL = questionAuthorURL
fmt.Printf("Author name is: %s\n", question.AuthorName)
// Determine if the question has been edited and update author details accordingly. // Determine if the question has been edited and update author details accordingly.
isQuestionEdited := selection.Find("a.js-gps-track").Text() == "edited" isQuestionEdited := selection.Find("a.js-gps-track").Text() == "edited"
if isQuestionEdited { if isQuestionEdited {
@ -247,9 +245,7 @@ func extractAnswersData(doc *goquery.Document, domain string) ([]types.FilteredA
answerBodyHTML, _ := answerBody.Html() answerBodyHTML, _ := answerBody.Html()
// Process code blocks within the answer. // Process code blocks within the answer.
processedAnswerBody := processAnswerBody(answerBodyHTML, domain) processedAnswerBody := processHTMLBody(answerBodyHTML)
processedAnswerBody = utils.ReplaceImgTags(processedAnswerBody)
fmt.Println(processedAnswerBody)
answer.Body = template.HTML(html.UnescapeString(processedAnswerBody)) answer.Body = template.HTML(html.UnescapeString(processedAnswerBody))
// Extract author information and timestamp. // Extract author information and timestamp.
@ -261,10 +257,11 @@ func extractAnswersData(doc *goquery.Document, domain string) ([]types.FilteredA
return answers, nil return answers, nil
} }
// processAnswerBody highlights syntax and processes code blocks within an answer's body. // processHTMLBody highlights syntax and replaces images with proxied versions.
func processAnswerBody(bodyHTML string, domain string) string { func processHTMLBody(bodyHTML string) string {
highlightedBody := utils.HighlightSyntaxViaContent(bodyHTML) highlightedBody := utils.HighlightCodeBlocks(bodyHTML)
return highlightedBody imageProxiedBody := utils.ReplaceImgTags(highlightedBody)
return imageProxiedBody
} }
// extractAnswerAuthorInfo extracts the author name, URL, and timestamp from an answer block. // extractAnswerAuthorInfo extracts the author name, URL, and timestamp from an answer block.

View File

@ -12,7 +12,9 @@ import (
"github.com/alecthomas/chroma/styles" "github.com/alecthomas/chroma/styles"
) )
func HighlightSyntaxViaContent(content string) (htmlOut string) { // highlightSyntaxViaContent uses Chroma to lex code content and apply the appropriate tokenizer engine.
// If it can't find one, it defaults to JavaScript syntax highlighting.
func highlightSyntaxViaContent(content string) (htmlOut string) {
content = html.UnescapeString(content) content = html.UnescapeString(content)
fallbackOut := html.EscapeString(content) fallbackOut := html.EscapeString(content)
@ -20,9 +22,7 @@ func HighlightSyntaxViaContent(content string) (htmlOut string) {
// identify the language // identify the language
lexer := lexers.Analyse(content) lexer := lexers.Analyse(content)
if lexer == nil { if lexer == nil {
// unable to identify, so just return the wrapped content lexer = lexers.Get(".js")
htmlOut = fallbackOut
return
} }
style := styles.Get("xcode") style := styles.Get("xcode")
@ -54,7 +54,9 @@ func HighlightSyntaxViaContent(content string) (htmlOut string) {
var preClassRegex = regexp.MustCompile(`(?s)<pre class=".+">`) var preClassRegex = regexp.MustCompile(`(?s)<pre class=".+">`)
func StripBlockTags(content string) (result string) { // stripBlockTags takes an extracted code block from HTML and strips it of its pre and code tags.
// What's returned is just the code.
func stripBlockTags(content string) (result string) {
// strip all "<code>" tags // strip all "<code>" tags
content = strings.Replace(content, "<code>", "", -1) content = strings.Replace(content, "<code>", "", -1)
content = strings.Replace(content, "</code>", "", -1) content = strings.Replace(content, "</code>", "", -1)
@ -68,3 +70,26 @@ func StripBlockTags(content string) (result string) {
return return
} }
// codeBlockRegex matches a single <pre><code>…</code></pre> block. Capture
// group 1 is the text between the tags. The (?s) flag lets "." span newlines,
// and the lazy quantifier stops at the first closing </code></pre> pair.
var codeBlockRegex = regexp.MustCompile(`(?s)<pre><code>(.*?)<\/code><\/pre>`)

// HighlightCodeBlocks replaces every <pre><code>…</code></pre> block found in
// bodyHTML with a syntax-highlighted version and returns the resulting HTML.
//
// For each matched block, the inner content is passed through stripBlockTags
// and then highlightSyntaxViaContent, and the result is wrapped in a bare
// <pre> element. Text outside the matched blocks is left as-is.
func HighlightCodeBlocks(bodyHTML string) string {
	// The parameter is deliberately not named "html": that would shadow the
	// html package used elsewhere in this file.
	return codeBlockRegex.ReplaceAllStringFunc(bodyHTML, func(codeBlock string) string {
		// ReplaceAllStringFunc only hands over the matched text, so the
		// pattern is applied again to recover capture group 1.
		match := codeBlockRegex.FindStringSubmatch(codeBlock)
		if len(match) < 2 {
			// Defensive guard: leave the block untouched rather than
			// panic on an out-of-range index if the pattern ever changes.
			return codeBlock
		}

		code := stripBlockTags(match[1])
		return "<pre>" + highlightSyntaxViaContent(code) + "</pre>"
	})
}