From e82646635e83a8c105179a532e9fcdb23a20788b Mon Sep 17 00:00:00 2001 From: httpjamesm <51917118+httpjamesm@users.noreply.github.com> Date: Sat, 9 Mar 2024 12:06:41 -0500 Subject: [PATCH] Replace StackOverflow Links (#90) * feat: replace stackoverflow and exchange links * fix: replace stackoverflow.com links with path * feat: run stack overflow link replacer on process * feat: process HTML on comment text --- go.mod | 4 ++++ go.sum | 2 ++ src/routes/question.go | 11 ++------- src/utils/comments.go | 2 +- src/utils/links.go | 45 +++++++++++++++++++++++++++++++++++++ src/utils/links_test.go | 49 +++++++++++++++++++++++++++++++++++++++++ src/utils/process.go | 9 ++++++++ 7 files changed, 112 insertions(+), 10 deletions(-) create mode 100644 src/utils/links.go create mode 100644 src/utils/links_test.go create mode 100644 src/utils/process.go diff --git a/go.mod b/go.mod index 2545ecc..823e3de 100644 --- a/go.mod +++ b/go.mod @@ -13,6 +13,7 @@ require ( require ( github.com/andybalholm/cascadia v1.3.1 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect github.com/dlclark/regexp2 v1.7.0 // indirect github.com/gin-contrib/sse v0.1.0 // indirect github.com/go-playground/locales v0.14.0 // indirect @@ -25,6 +26,8 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/pelletier/go-toml/v2 v2.0.6 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/stretchr/testify v1.9.0 // indirect github.com/ugorji/go/codec v1.2.8 // indirect golang.org/x/crypto v0.4.0 // indirect golang.org/x/net v0.7.0 // indirect @@ -32,4 +35,5 @@ require ( golang.org/x/text v0.7.0 // indirect google.golang.org/protobuf v1.28.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 017319e..61fbcb1 100644 --- a/go.sum +++ b/go.sum @@ -72,6 +72,8 @@ github.com/stretchr/testify v1.7.1/go.mod 
h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/ugorji/go/codec v1.2.8 h1:sgBJS6COt0b/P40VouWKdseidkDgHxYGm0SAglUHfP0= github.com/ugorji/go/codec v1.2.8/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= golang.org/x/crypto v0.0.0-20211215153901-e495a2d5b3d3/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= diff --git a/src/routes/question.go b/src/routes/question.go index 4cf8161..25293ad 100644 --- a/src/routes/question.go +++ b/src/routes/question.go @@ -180,7 +180,7 @@ func extractQuestionData(doc *goquery.Document, domain string) (question types.F if err != nil { return question, err } - question.Body = template.HTML(processHTMLBody(questionBodyParentHTML)) + question.Body = template.HTML(utils.ProcessHTMLBody(questionBodyParentHTML)) // Extract the shortened body description. shortenedBody := strings.TrimSpace(questionBodyParent.Text()) @@ -245,7 +245,7 @@ func extractAnswersData(doc *goquery.Document, domain string) ([]types.FilteredA answerBodyHTML, _ := answerBody.Html() // Process code blocks within the answer. - processedAnswerBody := processHTMLBody(answerBodyHTML) + processedAnswerBody := utils.ProcessHTMLBody(answerBodyHTML) answer.Body = template.HTML(html.UnescapeString(processedAnswerBody)) // Extract author information and timestamp. @@ -257,13 +257,6 @@ func extractAnswersData(doc *goquery.Document, domain string) ([]types.FilteredA return answers, nil } -// processHTMLBody highlights syntax and replaces images with proxied versions. 
// stackOverflowLinkQualifierRegex matches every anchor element whose href
//   - has a path beginning with /q/ or /questions/, and
//   - is either host-relative or points at stackoverflow.com,
//     stackexchange.com, or one of their subdomains.
//
// The match spans the whole element, from the opening <a up to the first
// closing </a>. Because "." does not match a newline, anchors whose content
// spans several lines are not matched.
var stackOverflowLinkQualifierRegex = regexp.MustCompile(`<a\s[^>]*href="(?:https?://(?:www\.)?(?:\w+\.)*(?:stackoverflow|stackexchange)\.com)?/(?:q|questions)/[^"]*"[^>]*>.*?</a>`)

// hrefAttrRegex captures the value of the first double-quoted href="..."
// occurrence in a string. It is compiled once at package scope instead of on
// every match.
var hrefAttrRegex = regexp.MustCompile(`href="([^"]*)"`)

// ReplaceStackOverflowLinks rewrites qualifying Stack Overflow and Stack
// Exchange question links in html into host-relative links.
//
// For every anchor matched by stackOverflowLinkQualifierRegex the href is
// replaced as follows:
//   - scheme and host are dropped;
//   - if the host, ignoring a leading "www.", has more than two labels, the
//     first label is treated as the site name and the path is prefixed with
//     "/exchange/<site>";
//   - the query string and fragment are kept, with their "?" and "#"
//     separators.
//
// Anchors whose href cannot be parsed are returned unchanged, as is all text
// outside the matched anchors.
func ReplaceStackOverflowLinks(html string) string {
	return stackOverflowLinkQualifierRegex.ReplaceAllStringFunc(html, func(anchor string) string {
		// Locate the href value by position so that exactly that attribute
		// value is replaced, even if the same text occurs elsewhere in the tag.
		loc := hrefAttrRegex.FindStringSubmatchIndex(anchor)
		if loc == nil {
			return anchor
		}
		valueStart, valueEnd := loc[2], loc[3]

		parsed, err := url.Parse(anchor[valueStart:valueEnd])
		if err != nil {
			return anchor
		}

		path := parsed.Path

		// "www" is not a site name: www.stackoverflow.com is the same site as
		// stackoverflow.com and must not become an /exchange/www link.
		host := strings.TrimPrefix(parsed.Hostname(), "www.")
		if labels := strings.Split(host, "."); len(labels) > 2 {
			path = "/exchange/" + labels[0] + path
		}

		// Let net/url serialize the result so the path is escaped and the
		// "?" and "#" separators are emitted only when needed.
		local := url.URL{
			Path:     path,
			RawQuery: parsed.RawQuery,
			Fragment: parsed.Fragment,
		}

		return anchor[:valueStart] + local.String() + anchor[valueEnd:]
	})
}
+
+
+
+
+ This question already has answers here: + +
+
+
+
+ + + +
Closed 4 years ago.
+
` + +func TestReplaceStackOverflowLinks(t *testing.T) { + replacedLinks := ReplaceStackOverflowLinks(sampleInput) + + fmt.Println(replacedLinks) + + assert.False(t, strings.Contains(replacedLinks, "stackoverflow.com")) + assert.False(t, strings.Contains(replacedLinks, "stackexchange.com")) +} diff --git a/src/utils/process.go b/src/utils/process.go new file mode 100644 index 0000000..1a00435 --- /dev/null +++ b/src/utils/process.go @@ -0,0 +1,9 @@ +package utils + +// ProcessHTMLBody runs HTML through the various preparation functions. +func ProcessHTMLBody(bodyHTML string) string { + highlightedBody := HighlightCodeBlocks(bodyHTML) + imageProxiedBody := ReplaceImgTags(highlightedBody) + stackOverflowLinksReplacedBody := ReplaceStackOverflowLinks(imageProxiedBody) + return stackOverflowLinksReplacedBody +}