refactor: highlight code blocks function

This commit is contained in:
httpjamesm 2024-03-09 11:14:39 -05:00
parent 634c7f1ad0
commit 42ad68fe34
No known key found for this signature in database
2 changed files with 37 additions and 15 deletions

View File

@ -180,7 +180,7 @@ func extractQuestionData(doc *goquery.Document, domain string) (question types.F
if err != nil {
return question, err
}
question.Body = template.HTML(utils.ReplaceImgTags(questionBodyParentHTML))
question.Body = template.HTML(processHTMLBody(questionBodyParentHTML))
// Extract the shortened body description.
shortenedBody := strings.TrimSpace(questionBodyParent.Text())
@ -212,8 +212,6 @@ func extractMetadata(selection *goquery.Selection, question *types.FilteredQuest
questionAuthorURL += questionAuthor.AttrOr("href", "")
question.AuthorURL = questionAuthorURL
fmt.Printf("Author name is: %s\n", question.AuthorName)
// Determine if the question has been edited and update author details accordingly.
isQuestionEdited := selection.Find("a.js-gps-track").Text() == "edited"
if isQuestionEdited {
@ -247,9 +245,7 @@ func extractAnswersData(doc *goquery.Document, domain string) ([]types.FilteredA
answerBodyHTML, _ := answerBody.Html()
// Process code blocks within the answer.
processedAnswerBody := processAnswerBody(answerBodyHTML, domain)
processedAnswerBody = utils.ReplaceImgTags(processedAnswerBody)
fmt.Println(processedAnswerBody)
processedAnswerBody := processHTMLBody(answerBodyHTML)
answer.Body = template.HTML(html.UnescapeString(processedAnswerBody))
// Extract author information and timestamp.
@ -261,10 +257,11 @@ func extractAnswersData(doc *goquery.Document, domain string) ([]types.FilteredA
return answers, nil
}
// processAnswerBody highlights syntax and processes code blocks within an answer's body.
func processAnswerBody(bodyHTML string, domain string) string {
highlightedBody := utils.HighlightSyntaxViaContent(bodyHTML)
return highlightedBody
// processHTMLBody prepares a scraped HTML fragment for rendering.
// The fragment is first run through utils.HighlightCodeBlocks, and that
// result is then handed to utils.ReplaceImgTags (which, per its use here,
// swaps image tags for proxied versions). The final string is returned.
func processHTMLBody(bodyHTML string) string {
	return utils.ReplaceImgTags(utils.HighlightCodeBlocks(bodyHTML))
}
// extractAnswerAuthorInfo extracts the author name, URL, and timestamp from an answer block.

View File

@ -12,7 +12,9 @@ import (
"github.com/alecthomas/chroma/styles"
)
func HighlightSyntaxViaContent(content string) (htmlOut string) {
// highlightSyntaxViaContent uses Chroma to lex code content and apply the appropriate tokenizer engine.
// If it can't find one, it defaults to JavaScript syntax highlighting.
func highlightSyntaxViaContent(content string) (htmlOut string) {
content = html.UnescapeString(content)
fallbackOut := html.EscapeString(content)
@ -20,9 +22,7 @@ func HighlightSyntaxViaContent(content string) (htmlOut string) {
// identify the language
lexer := lexers.Analyse(content)
if lexer == nil {
// unable to identify, so just return the wrapped content
htmlOut = fallbackOut
return
lexer = lexers.Get(".js")
}
style := styles.Get("xcode")
@ -54,7 +54,9 @@ func HighlightSyntaxViaContent(content string) (htmlOut string) {
var preClassRegex = regexp.MustCompile(`(?s)<pre class=".+">`)
func StripBlockTags(content string) (result string) {
// stripBlockTags takes an extracted code block from HTML and strips it of its pre and code tags.
// What's returned is just the code.
func stripBlockTags(content string) (result string) {
// strip all "<code>" tags
content = strings.Replace(content, "<code>", "", -1)
content = strings.Replace(content, "</code>", "", -1)
@ -68,3 +70,26 @@ func StripBlockTags(content string) (result string) {
return
}
var codeBlockRegex = regexp.MustCompile(`(?s)<pre><code>(.*?)<\/code><\/pre>`)
// HighlightCodeBlocks uses both highlightSyntaxViaContent stripCodeBlocks and returns the newly highlighted code HTML.
func HighlightCodeBlocks(html string) string {
// Replace each code block with the highlighted version
highlightedHTML := codeBlockRegex.ReplaceAllStringFunc(html, func(codeBlock string) string {
// Extract the code content from the code block
codeContent := codeBlockRegex.FindStringSubmatch(codeBlock)[1]
codeContent = stripBlockTags(codeContent)
// Highlight the code content
highlightedCode := highlightSyntaxViaContent(codeContent)
// Replace the original code block with the highlighted version
highlightedCodeBlock := "<pre>" + highlightedCode + "</pre>"
return highlightedCodeBlock
})
return highlightedHTML
}