Replace StackOverflow Links (#90)
* feat: replace stackoverflow and exchange links
* fix: replace stackoverflow.com links with path
* feat: run stack overflow link replacer on process
* feat: process HTML on comment text
This commit is contained in:
parent
ff66f41f47
commit
e82646635e
4
go.mod
4
go.mod
@ -13,6 +13,7 @@ require (
|
|||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/andybalholm/cascadia v1.3.1 // indirect
|
github.com/andybalholm/cascadia v1.3.1 // indirect
|
||||||
|
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||||
github.com/dlclark/regexp2 v1.7.0 // indirect
|
github.com/dlclark/regexp2 v1.7.0 // indirect
|
||||||
github.com/gin-contrib/sse v0.1.0 // indirect
|
github.com/gin-contrib/sse v0.1.0 // indirect
|
||||||
github.com/go-playground/locales v0.14.0 // indirect
|
github.com/go-playground/locales v0.14.0 // indirect
|
||||||
@ -25,6 +26,8 @@ require (
|
|||||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||||
github.com/pelletier/go-toml/v2 v2.0.6 // indirect
|
github.com/pelletier/go-toml/v2 v2.0.6 // indirect
|
||||||
|
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||||
|
github.com/stretchr/testify v1.9.0 // indirect
|
||||||
github.com/ugorji/go/codec v1.2.8 // indirect
|
github.com/ugorji/go/codec v1.2.8 // indirect
|
||||||
golang.org/x/crypto v0.4.0 // indirect
|
golang.org/x/crypto v0.4.0 // indirect
|
||||||
golang.org/x/net v0.7.0 // indirect
|
golang.org/x/net v0.7.0 // indirect
|
||||||
@ -32,4 +35,5 @@ require (
|
|||||||
golang.org/x/text v0.7.0 // indirect
|
golang.org/x/text v0.7.0 // indirect
|
||||||
google.golang.org/protobuf v1.28.1 // indirect
|
google.golang.org/protobuf v1.28.1 // indirect
|
||||||
gopkg.in/yaml.v2 v2.4.0 // indirect
|
gopkg.in/yaml.v2 v2.4.0 // indirect
|
||||||
|
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||||
)
|
)
|
||||||
|
2
go.sum
2
go.sum
@ -72,6 +72,8 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
|
|||||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||||
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
|
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
|
||||||
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||||
|
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||||
|
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||||
github.com/ugorji/go/codec v1.2.8 h1:sgBJS6COt0b/P40VouWKdseidkDgHxYGm0SAglUHfP0=
|
github.com/ugorji/go/codec v1.2.8 h1:sgBJS6COt0b/P40VouWKdseidkDgHxYGm0SAglUHfP0=
|
||||||
github.com/ugorji/go/codec v1.2.8/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
|
github.com/ugorji/go/codec v1.2.8/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
|
||||||
golang.org/x/crypto v0.0.0-20211215153901-e495a2d5b3d3/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
|
golang.org/x/crypto v0.0.0-20211215153901-e495a2d5b3d3/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
|
||||||
|
@ -180,7 +180,7 @@ func extractQuestionData(doc *goquery.Document, domain string) (question types.F
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return question, err
|
return question, err
|
||||||
}
|
}
|
||||||
question.Body = template.HTML(processHTMLBody(questionBodyParentHTML))
|
question.Body = template.HTML(utils.ProcessHTMLBody(questionBodyParentHTML))
|
||||||
|
|
||||||
// Extract the shortened body description.
|
// Extract the shortened body description.
|
||||||
shortenedBody := strings.TrimSpace(questionBodyParent.Text())
|
shortenedBody := strings.TrimSpace(questionBodyParent.Text())
|
||||||
@ -245,7 +245,7 @@ func extractAnswersData(doc *goquery.Document, domain string) ([]types.FilteredA
|
|||||||
answerBodyHTML, _ := answerBody.Html()
|
answerBodyHTML, _ := answerBody.Html()
|
||||||
|
|
||||||
// Process code blocks within the answer.
|
// Process code blocks within the answer.
|
||||||
processedAnswerBody := processHTMLBody(answerBodyHTML)
|
processedAnswerBody := utils.ProcessHTMLBody(answerBodyHTML)
|
||||||
answer.Body = template.HTML(html.UnescapeString(processedAnswerBody))
|
answer.Body = template.HTML(html.UnescapeString(processedAnswerBody))
|
||||||
|
|
||||||
// Extract author information and timestamp.
|
// Extract author information and timestamp.
|
||||||
@ -257,13 +257,6 @@ func extractAnswersData(doc *goquery.Document, domain string) ([]types.FilteredA
|
|||||||
return answers, nil
|
return answers, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// processHTMLBody highlights syntax and replaces images with proxied versions.
|
|
||||||
func processHTMLBody(bodyHTML string) string {
|
|
||||||
highlightedBody := utils.HighlightCodeBlocks(bodyHTML)
|
|
||||||
imageProxiedBody := utils.ReplaceImgTags(highlightedBody)
|
|
||||||
return imageProxiedBody
|
|
||||||
}
|
|
||||||
|
|
||||||
// extractAnswerAuthorInfo extracts the author name, URL, and timestamp from an answer block.
|
// extractAnswerAuthorInfo extracts the author name, URL, and timestamp from an answer block.
|
||||||
// It directly mutates the answer.
|
// It directly mutates the answer.
|
||||||
func extractAnswerAuthorInfo(selection *goquery.Selection, answer *types.FilteredAnswer, domain string) {
|
func extractAnswerAuthorInfo(selection *goquery.Selection, answer *types.FilteredAnswer, domain string) {
|
||||||
|
@ -50,7 +50,7 @@ func FindAndReturnComments(inHtml, domain string, postLayout *goquery.Selection)
|
|||||||
commentTimestamp := commentBody.Find("span.relativetime-clean").Text()
|
commentTimestamp := commentBody.Find("span.relativetime-clean").Text()
|
||||||
|
|
||||||
newFilteredComment := types.FilteredComment{
|
newFilteredComment := types.FilteredComment{
|
||||||
Text: template.HTML(commentCopy),
|
Text: template.HTML(ProcessHTMLBody(commentCopy)),
|
||||||
Timestamp: commentTimestamp,
|
Timestamp: commentTimestamp,
|
||||||
AuthorName: commentAuthor.Text(),
|
AuthorName: commentAuthor.Text(),
|
||||||
AuthorURL: commentAuthorURL,
|
AuthorURL: commentAuthorURL,
|
||||||
|
45
src/utils/links.go
Normal file
45
src/utils/links.go
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
package utils
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/url"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// stackOverflowLinkQualifierRegex matches the opening tag of every anchor
// element that meets the following conditions:
//   - the href attribute value is double-quoted
//   - the href has a pathname beginning with /q/ or /questions/
//   - if the href has a host, it is stackoverflow.com, stackexchange.com, or a
//     subdomain of either
//
// The first capture group is the href value. Only the opening tag is matched,
// so anchors whose text spans several lines qualify as well.
var stackOverflowLinkQualifierRegex = regexp.MustCompile(`<a\s[^>]*href="((?:https?://(?:www\.)?(?:\w+\.)*(?:stackoverflow|stackexchange)\.com)?/(?:q|questions)/[^"]*)"[^>]*>`)

// ReplaceStackOverflowLinks rewrites the href of every qualifying question
// link in html so that it no longer points at stackoverflow.com or
// stackexchange.com. The scheme and host are dropped, leaving a host-relative
// path; links to a subdomain (other than www) get "/exchange/<subdomain>"
// prepended to that path. Query strings and fragments are preserved.
//
// Anchors that do not qualify, and hrefs that fail to parse as URLs, are
// returned unchanged.
func ReplaceStackOverflowLinks(html string) string {
	matches := stackOverflowLinkQualifierRegex.FindAllStringSubmatchIndex(html, -1)
	if len(matches) == 0 {
		return html
	}

	var b strings.Builder
	b.Grow(len(html))
	last := 0
	for _, m := range matches {
		// m[2]:m[3] bounds the first capture group, i.e. the href value.
		// Splicing by index guarantees the href itself is replaced, never an
		// identical substring that happens to appear earlier in the tag.
		start, end := m[2], m[3]
		rewritten, ok := rewriteQuestionHref(html[start:end])
		if !ok {
			continue
		}
		b.WriteString(html[last:start])
		b.WriteString(rewritten)
		last = end
	}
	b.WriteString(html[last:])
	return b.String()
}

// rewriteQuestionHref converts a single question href into its host-relative
// replacement. It reports false when href cannot be parsed as a URL.
func rewriteQuestionHref(href string) (string, bool) {
	parsed, err := url.Parse(href)
	if err != nil {
		return "", false
	}

	var b strings.Builder

	// "www" is an alias of the bare domain, not a site of its own, so strip it
	// before deciding whether the host carries a subdomain.
	host := strings.TrimPrefix(parsed.Hostname(), "www.")
	if labels := strings.Split(host, "."); len(labels) > 2 {
		b.WriteString("/exchange/")
		b.WriteString(labels[0])
	}

	// Use the escaped forms so percent-encoding in the original href survives.
	b.WriteString(parsed.EscapedPath())
	if parsed.RawQuery != "" {
		b.WriteByte('?')
		b.WriteString(parsed.RawQuery)
	}
	if parsed.Fragment != "" {
		b.WriteByte('#')
		b.WriteString(parsed.EscapedFragment())
	}
	return b.String(), true
}
|
49
src/utils/links_test.go
Normal file
49
src/utils/links_test.go
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
package utils
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// sampleInput is an HTML fragment holding three question links used by
// TestReplaceStackOverflowLinks: a host-less /questions path, an absolute
// stackoverflow.com URL, and an absolute security.stackexchange.com URL.
var sampleInput = `<div class="d-flex fd-column fw-nowrap">
|
||||||
|
<div class="d-flex fw-nowrap">
|
||||||
|
<div class="flex--item wmn0 fl1 lh-lg">
|
||||||
|
<div class="flex--item fl1 lh-lg">
|
||||||
|
<div>
|
||||||
|
<b>This question already has answers here</b>:
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="flex--item mb0 mt4">
|
||||||
|
<a href="/questions/55083952/is-it-possible-to-populate-a-large-set-at-compile-time" dir="ltr">Is it possible to populate a large set at compile time?</a>
|
||||||
|
<span class="question-originals-answer-count">
|
||||||
|
(3 answers)
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="flex--item mb0 mt4">
|
||||||
|
<a href="https://stackoverflow.com/questions/27221504/how-can-you-make-a-safe-static-singleton-in-rust" dir="ltr">How can you make a safe static singleton in Rust?</a>
|
||||||
|
<span class="question-originals-answer-count">
|
||||||
|
(5 answers)
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="flex--item mb0 mt4">
|
||||||
|
<a href="https://security.stackexchange.com/questions/25371/brute-force-an-ssh-login-that-has-only-a-4-letter-password" dir="ltr">Brute-force an SSH-login that has only a 4-letter password</a>
|
||||||
|
<span class="question-originals-answer-count">
|
||||||
|
(9 answers)
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="flex--item mb0 mt8">Closed <span title="2020-01-29 14:28:42Z" class="relativetime">4 years ago</span>.</div>
|
||||||
|
</div>`
|
||||||
|
|
||||||
|
func TestReplaceStackOverflowLinks(t *testing.T) {
|
||||||
|
replacedLinks := ReplaceStackOverflowLinks(sampleInput)
|
||||||
|
|
||||||
|
fmt.Println(replacedLinks)
|
||||||
|
|
||||||
|
assert.False(t, strings.Contains(replacedLinks, "stackoverflow.com"))
|
||||||
|
assert.False(t, strings.Contains(replacedLinks, "stackexchange.com"))
|
||||||
|
}
|
9
src/utils/process.go
Normal file
9
src/utils/process.go
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
package utils
|
||||||
|
|
||||||
|
// ProcessHTMLBody runs HTML through the various preparation functions.
|
||||||
|
func ProcessHTMLBody(bodyHTML string) string {
|
||||||
|
highlightedBody := HighlightCodeBlocks(bodyHTML)
|
||||||
|
imageProxiedBody := ReplaceImgTags(highlightedBody)
|
||||||
|
stackOverflowLinksReplacedBody := ReplaceStackOverflowLinks(imageProxiedBody)
|
||||||
|
return stackOverflowLinksReplacedBody
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user