Replace StackOverflow Links (#90)

* feat: replace stackoverflow and exchange links
* fix: replace stackoverflow.com links with path
* feat: run stack overflow link replacer on process
* feat: process HTML on comment text
2024-03-09 12:06:41 -05:00
parent ff66f41f47
commit e82646635e
7 changed files with 112 additions and 10 deletions
--- a/src/routes/question.go
+++ b/src/routes/question.go
@ -180,7 +180,7 @@ func extractQuestionData(doc *goquery.Document, domain string) (question types.F
 	if err != nil {
 		return question, err
 	}
-	question.Body = template.HTML(processHTMLBody(questionBodyParentHTML))
+	question.Body = template.HTML(utils.ProcessHTMLBody(questionBodyParentHTML))

 	// Extract the shortened body description.
 	shortenedBody := strings.TrimSpace(questionBodyParent.Text())
@ -245,7 +245,7 @@ func extractAnswersData(doc *goquery.Document, domain string) ([]types.FilteredA
 		answerBodyHTML, _ := answerBody.Html()

 		// Process code blocks within the answer.
-		processedAnswerBody := processHTMLBody(answerBodyHTML)
+		processedAnswerBody := utils.ProcessHTMLBody(answerBodyHTML)
 		answer.Body = template.HTML(html.UnescapeString(processedAnswerBody))

 		// Extract author information and timestamp.
@ -257,13 +257,6 @@ func extractAnswersData(doc *goquery.Document, domain string) ([]types.FilteredA
 	return answers, nil
 }

-// processHTMLBody highlights syntax and replaces images with proxied versions.
-func processHTMLBody(bodyHTML string) string {
-	highlightedBody := utils.HighlightCodeBlocks(bodyHTML)
-	imageProxiedBody := utils.ReplaceImgTags(highlightedBody)
-	return imageProxiedBody
-}
-
 // extractAnswerAuthorInfo extracts the author name, URL, and timestamp from an answer block.
 // It directly mutates the answer.
 func extractAnswerAuthorInfo(selection *goquery.Selection, answer *types.FilteredAnswer, domain string) {
--- a/src/utils/comments.go
+++ b/src/utils/comments.go
@ -50,7 +50,7 @@ func FindAndReturnComments(inHtml, domain string, postLayout *goquery.Selection)
 		commentTimestamp := commentBody.Find("span.relativetime-clean").Text()

 		newFilteredComment := types.FilteredComment{
-			Text:       template.HTML(commentCopy),
+			Text:       template.HTML(ProcessHTMLBody(commentCopy)),
 			Timestamp:  commentTimestamp,
 			AuthorName: commentAuthor.Text(),
 			AuthorURL:  commentAuthorURL,
--- a/src/utils/links.go
+++ b/src/utils/links.go
@ -0,0 +1,45 @@
+package utils
+
+import (
+	"net/url"
+	"regexp"
+	"strings"
+)
+
+// stackOverflowLinkQualifierRegex matches the opening tag of every anchor element whose
+// href meets the following conditions:
+// * the path begins with /q/ or /questions/
+// * if there is a host, it is stackoverflow.com, stackexchange.com, or a subdomain of either
+//
+// Only the opening tag is matched (not the link text or the closing tag), so anchors whose
+// content spans several lines are rewritten as well.
+var stackOverflowLinkQualifierRegex = regexp.MustCompile(`<a\s[^>]*href="(?:https?://(?:www\.)?(?:\w+\.)*(?:stackoverflow|stackexchange)\.com)?/(?:q|questions)/[^"]*"[^>]*>`)
+
+// stackOverflowLinkHrefRegex captures the value of the first double-quoted href attribute
+// found in a tag. It is compiled once here instead of once per matched link.
+var stackOverflowLinkHrefRegex = regexp.MustCompile(`href="([^"]*)"`)
+
+// ReplaceStackOverflowLinks rewrites the href of every anchor matched by
+// stackOverflowLinkQualifierRegex into a host-less path, keeping the query string and
+// fragment. When the original host has a subdomain other than "www", the path is
+// prefixed with "/exchange/<subdomain>". Anchors whose href cannot be located or parsed
+// are returned unchanged, as is all other markup.
+func ReplaceStackOverflowLinks(html string) string {
+	return stackOverflowLinkQualifierRegex.ReplaceAllStringFunc(html, func(tag string) string {
+		// Locate the href value by index so exactly that attribute value is replaced,
+		// rather than the first textual occurrence of the same string in the tag.
+		loc := stackOverflowLinkHrefRegex.FindStringSubmatchIndex(tag)
+		if loc == nil {
+			return tag
+		}
+		valueStart, valueEnd := loc[2], loc[3]
+
+		parsed, err := url.Parse(tag[valueStart:valueEnd])
+		if err != nil {
+			return tag
+		}
+
+		// Use the escaped path: writing the decoded form back could place characters
+		// such as a double quote (from %22) inside the attribute value.
+		newHref := parsed.EscapedPath()
+
+		// "www" is not a site name, so drop it before checking for a subdomain.
+		host := strings.TrimPrefix(parsed.Hostname(), "www.")
+		if labels := strings.Split(host, "."); len(labels) > 2 {
+			// Prepend the subdomain to the path.
+			newHref = "/exchange/" + labels[0] + newHref
+		}
+
+		// RawQuery and the fragment are stored without their "?" / "#" separators.
+		if parsed.RawQuery != "" {
+			newHref += "?" + parsed.RawQuery
+		}
+		if fragment := parsed.EscapedFragment(); fragment != "" {
+			newHref += "#" + fragment
+		}
+
+		return tag[:valueStart] + newHref + tag[valueEnd:]
+	})
+}
--- a/src/utils/links_test.go
+++ b/src/utils/links_test.go
@ -0,0 +1,49 @@
+package utils
+
+import (
+	"fmt"
+	"github.com/stretchr/testify/assert"
+	"strings"
+	"testing"
+)
+
+// sampleInput is an HTML fixture containing three question links: a host-less
+// /questions/ path, an absolute stackoverflow.com URL, and an absolute
+// security.stackexchange.com URL.
+var sampleInput = `<div class="d-flex fd-column fw-nowrap">
+	<div class="d-flex fw-nowrap">
+		<div class="flex--item wmn0 fl1 lh-lg">
+			<div class="flex--item fl1 lh-lg">
+					<div>
+						<b>This question already has answers here</b>:
+
+					</div>
+			</div>
+		</div>
+	</div>
+			<div class="flex--item mb0 mt4">
+				<a href="/questions/55083952/is-it-possible-to-populate-a-large-set-at-compile-time" dir="ltr">Is it possible to populate a large set at compile time?</a>
+					<span class="question-originals-answer-count">
+						(3 answers)
+					</span>
+			</div>
+			<div class="flex--item mb0 mt4">
+				<a href="https://stackoverflow.com/questions/27221504/how-can-you-make-a-safe-static-singleton-in-rust" dir="ltr">How can you make a safe static singleton in Rust?</a>
+					<span class="question-originals-answer-count">
+						(5 answers)
+					</span>
+			</div>
+			<div class="flex--item mb0 mt4">
+				<a href="https://security.stackexchange.com/questions/25371/brute-force-an-ssh-login-that-has-only-a-4-letter-password" dir="ltr">Brute-force an SSH-login that has only a 4-letter password</a>
+					<span class="question-originals-answer-count">
+						(9 answers)
+					</span>
+			</div>
+		<div class="flex--item mb0 mt8">Closed <span title="2020-01-29 14:28:42Z" class="relativetime">4 years ago</span>.</div>
+</div>`
+
+// TestReplaceStackOverflowLinks runs ReplaceStackOverflowLinks over sampleInput and
+// checks both that no Stack Overflow / Stack Exchange hostname remains and that each
+// link was rewritten to the expected path.
+func TestReplaceStackOverflowLinks(t *testing.T) {
+	replacedLinks := ReplaceStackOverflowLinks(sampleInput)
+
+	assert.False(t, strings.Contains(replacedLinks, "stackoverflow.com"))
+	assert.False(t, strings.Contains(replacedLinks, "stackexchange.com"))
+
+	// Absence of the hostnames alone would also hold if the links had been dropped
+	// entirely, so pin the rewritten href of every anchor in the fixture.
+	expectedPaths := []string{
+		"/questions/55083952/is-it-possible-to-populate-a-large-set-at-compile-time",
+		"/questions/27221504/how-can-you-make-a-safe-static-singleton-in-rust",
+		"/exchange/security/questions/25371/brute-force-an-ssh-login-that-has-only-a-4-letter-password",
+	}
+	for _, path := range expectedPaths {
+		attr := fmt.Sprintf(`href="%s"`, path)
+		assert.True(t, strings.Contains(replacedLinks, attr), "missing %s in output", attr)
+	}
+}
--- a/src/utils/process.go
+++ b/src/utils/process.go
@ -0,0 +1,9 @@
+package utils
+
+// ProcessHTMLBody prepares an HTML fragment by passing it, in order, through
+// HighlightCodeBlocks, ReplaceImgTags and ReplaceStackOverflowLinks, and returns
+// the final result.
+func ProcessHTMLBody(bodyHTML string) string {
+	return ReplaceStackOverflowLinks(ReplaceImgTags(HighlightCodeBlocks(bodyHTML)))
+}