owl-blogs/webmention.go

317 lines
7.9 KiB
Go
Raw Normal View History

2022-08-27 21:01:14 +00:00
package owl
import (
"bytes"
"errors"
2022-09-06 17:47:15 +00:00
"io"
2022-08-27 21:01:14 +00:00
"net/http"
2022-09-04 15:10:40 +00:00
"net/url"
2022-08-27 21:01:14 +00:00
"strings"
2022-09-01 19:53:06 +00:00
"time"
2022-08-27 21:01:14 +00:00
"golang.org/x/net/html"
)
2022-09-04 13:03:16 +00:00
// WebmentionIn is the stored record of a received webmention. The struct
// tags define its YAML representation.
type WebmentionIn struct {
	// Source is never modified by UpdateWith.
	Source string `yaml:"source"`
	// Title is a human-readable title for the source.
	Title string `yaml:"title"`
	// ApprovalStatus is a free-form status string; the set of valid values
	// is defined by callers, not by this type.
	ApprovalStatus string `yaml:"approval_status"`
	// RetrievedAt is a timestamp; the zero time means "not set".
	RetrievedAt time.Time `yaml:"retrieved_at"`
}

// UpdateWith merges update into the receiver. Only non-zero fields of update
// are copied: an empty Title or ApprovalStatus and a zero RetrievedAt leave
// the existing value untouched. Source is never changed.
func (w *WebmentionIn) UpdateWith(update WebmentionIn) {
	if update.Title != "" {
		w.Title = update.Title
	}
	if update.ApprovalStatus != "" {
		w.ApprovalStatus = update.ApprovalStatus
	}
	if !update.RetrievedAt.IsZero() {
		w.RetrievedAt = update.RetrievedAt
	}
}
2022-09-04 13:03:16 +00:00
// WebmentionOut is the stored record of a webmention target. The struct tags
// define its YAML representation.
type WebmentionOut struct {
	// Target is never modified by UpdateWith.
	Target string `yaml:"target"`
	// Supported is a flag that UpdateWith can set but never clear.
	Supported bool `yaml:"supported"`
	// ScannedAt is a timestamp; the zero time means "not set".
	ScannedAt time.Time `yaml:"scanned_at"`
	// LastSentAt is a timestamp; the zero time means "not set".
	LastSentAt time.Time `yaml:"last_sent_at"`
}

// UpdateWith merges update into the receiver. Only non-zero fields of update
// are copied, so Supported can be switched on but not off through this
// method, and zero timestamps leave the existing values untouched. Target is
// never changed.
func (w *WebmentionOut) UpdateWith(update WebmentionOut) {
	if update.Supported {
		w.Supported = true
	}
	if !update.ScannedAt.IsZero() {
		w.ScannedAt = update.ScannedAt
	}
	if !update.LastSentAt.IsZero() {
		w.LastSentAt = update.LastSentAt
	}
}
2022-09-04 15:10:40 +00:00
// HttpClient is the subset of *http.Client methods used for fetching pages
// and sending webmentions. The method signatures mirror those of
// *http.Client, so a standard client satisfies the interface directly.
type HttpClient interface {
	// Get issues a GET request to url.
	Get(url string) (resp *http.Response, err error)
	// Post issues a POST request to url with the given content type and body.
	Post(url, contentType string, body io.Reader) (resp *http.Response, err error)
	// PostForm issues a POST request to url with data as the form-encoded body.
	PostForm(url string, data url.Values) (resp *http.Response, err error)
}
2022-09-04 15:10:40 +00:00
type HtmlParser interface {
ParseHEntry(resp *http.Response) (ParsedHEntry, error)
ParseLinks(resp *http.Response) ([]string, error)
ParseLinksFromString(string) ([]string, error)
GetWebmentionEndpoint(resp *http.Response) (string, error)
2022-11-06 13:17:14 +00:00
GetRedirctUris(resp *http.Response) ([]string, error)
}
// OwlHttpClient is an alias for the standard library http.Client.
type OwlHttpClient = http.Client
2022-08-27 21:01:14 +00:00
2022-09-04 15:10:40 +00:00
// OwlHtmlParser is a stateless type whose methods parse HTTP responses and
// HTML documents (h-entry data, links, webmention endpoints, redirect URIs).
type OwlHtmlParser struct{}
2022-08-27 21:01:14 +00:00
// ParsedHEntry holds the data extracted from an h-entry by ParseHEntry.
type ParsedHEntry struct {
	// Title is the concatenated text content of the entry's p-name element.
	Title string
}
func collectText(n *html.Node, buf *bytes.Buffer) {
2022-08-27 21:01:14 +00:00
if n.Type == html.TextNode {
buf.WriteString(n.Data)
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
collectText(c, buf)
}
}
// readResponseBody drains resp.Body and returns its content as a string.
// The body is closed before returning, whether or not reading succeeded.
// On a read error the returned string is empty.
func readResponseBody(resp *http.Response) (string, error) {
	body := resp.Body
	defer body.Close()

	raw, readErr := io.ReadAll(body)
	if readErr == nil {
		return string(raw), nil
	}
	return "", readErr
}
func (OwlHtmlParser) ParseHEntry(resp *http.Response) (ParsedHEntry, error) {
htmlStr, err := readResponseBody(resp)
2022-09-06 19:01:08 +00:00
if err != nil {
return ParsedHEntry{}, err
}
doc, err := html.Parse(strings.NewReader(htmlStr))
2022-08-27 21:01:14 +00:00
if err != nil {
return ParsedHEntry{}, err
}
var interpretHFeed func(*html.Node, *ParsedHEntry, bool) (ParsedHEntry, error)
interpretHFeed = func(n *html.Node, curr *ParsedHEntry, parent bool) (ParsedHEntry, error) {
attrs := n.Attr
for _, attr := range attrs {
if attr.Key == "class" && strings.Contains(attr.Val, "p-name") {
buf := &bytes.Buffer{}
collectText(n, buf)
curr.Title = buf.String()
return *curr, nil
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
interpretHFeed(c, curr, false)
}
return *curr, nil
}
var findHFeed func(*html.Node) (ParsedHEntry, error)
findHFeed = func(n *html.Node) (ParsedHEntry, error) {
attrs := n.Attr
for _, attr := range attrs {
if attr.Key == "class" && strings.Contains(attr.Val, "h-entry") {
return interpretHFeed(n, &ParsedHEntry{}, true)
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
entry, err := findHFeed(c)
if err == nil {
return entry, nil
}
}
return ParsedHEntry{}, errors.New("no h-entry found")
}
return findHFeed(doc)
}
2022-09-04 13:32:37 +00:00
// ParseLinks reads the body of resp (closing it) and returns the links found
// in it by ParseLinksFromString. If the body cannot be read, an empty slice
// and the read error are returned.
func (parser OwlHtmlParser) ParseLinks(resp *http.Response) ([]string, error) {
	body, readErr := readResponseBody(resp)
	if readErr == nil {
		return parser.ParseLinksFromString(body)
	}
	return []string{}, readErr
}
func (OwlHtmlParser) ParseLinksFromString(htmlStr string) ([]string, error) {
doc, err := html.Parse(strings.NewReader(htmlStr))
2022-09-04 13:32:37 +00:00
if err != nil {
return make([]string, 0), err
}
var findLinks func(*html.Node) ([]string, error)
findLinks = func(n *html.Node) ([]string, error) {
links := make([]string, 0)
if n.Type == html.ElementNode && n.Data == "a" {
for _, attr := range n.Attr {
if attr.Key == "href" {
links = append(links, attr.Val)
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
childLinks, _ := findLinks(c)
links = append(links, childLinks...)
}
return links, nil
}
return findLinks(doc)
}
2022-09-04 15:10:40 +00:00
func (OwlHtmlParser) GetWebmentionEndpoint(resp *http.Response) (string, error) {
2022-09-06 19:01:08 +00:00
//request url
requestUrl := resp.Request.URL
// Check link headers
2022-09-06 19:17:25 +00:00
for _, linkHeader := range resp.Header["Link"] {
linkHeaderParts := strings.Split(linkHeader, ",")
for _, linkHeaderPart := range linkHeaderParts {
linkHeaderPart = strings.TrimSpace(linkHeaderPart)
params := strings.Split(linkHeaderPart, ";")
if len(params) != 2 {
continue
}
for _, param := range params[1:] {
param = strings.TrimSpace(param)
if strings.Contains(param, "webmention") {
link := strings.Split(params[0], ";")[0]
link = strings.Trim(link, "<>")
linkUrl, err := url.Parse(link)
if err != nil {
return "", err
}
return requestUrl.ResolveReference(linkUrl).String(), nil
}
2022-09-06 19:01:08 +00:00
}
}
}
htmlStr, err := readResponseBody(resp)
2022-09-06 19:01:08 +00:00
if err != nil {
return "", err
}
doc, err := html.Parse(strings.NewReader(htmlStr))
2022-09-04 15:10:40 +00:00
if err != nil {
return "", err
}
var findEndpoint func(*html.Node) (string, error)
findEndpoint = func(n *html.Node) (string, error) {
2022-09-06 17:47:15 +00:00
if n.Type == html.ElementNode && (n.Data == "link" || n.Data == "a") {
2022-09-04 15:10:40 +00:00
for _, attr := range n.Attr {
2022-09-06 19:05:51 +00:00
if attr.Key == "rel" {
vals := strings.Split(attr.Val, " ")
for _, val := range vals {
if val == "webmention" {
for _, attr := range n.Attr {
if attr.Key == "href" {
return attr.Val, nil
}
}
2022-09-04 15:10:40 +00:00
}
}
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
endpoint, err := findEndpoint(c)
if err == nil {
return endpoint, nil
}
}
return "", errors.New("no webmention endpoint found")
}
2022-09-06 19:01:08 +00:00
linkUrlStr, err := findEndpoint(doc)
if err != nil {
return "", err
}
linkUrl, err := url.Parse(linkUrlStr)
if err != nil {
return "", err
}
return requestUrl.ResolveReference(linkUrl).String(), nil
2022-09-04 15:10:40 +00:00
}
2022-11-06 13:17:14 +00:00
func (OwlHtmlParser) GetRedirctUris(resp *http.Response) ([]string, error) {
//request url
requestUrl := resp.Request.URL
htmlStr, err := readResponseBody(resp)
if err != nil {
return make([]string, 0), err
}
doc, err := html.Parse(strings.NewReader(htmlStr))
if err != nil {
return make([]string, 0), err
}
var findLinks func(*html.Node) ([]string, error)
2022-11-06 13:36:37 +00:00
// Check link headers
header_links := make([]string, 0)
for _, linkHeader := range resp.Header["Link"] {
linkHeaderParts := strings.Split(linkHeader, ",")
for _, linkHeaderPart := range linkHeaderParts {
linkHeaderPart = strings.TrimSpace(linkHeaderPart)
params := strings.Split(linkHeaderPart, ";")
if len(params) != 2 {
continue
}
for _, param := range params[1:] {
param = strings.TrimSpace(param)
if strings.Contains(param, "redirect_uri") {
link := strings.Split(params[0], ";")[0]
link = strings.Trim(link, "<>")
linkUrl, err := url.Parse(link)
if err == nil {
header_links = append(header_links, requestUrl.ResolveReference(linkUrl).String())
}
}
}
}
}
2022-11-06 13:17:14 +00:00
findLinks = func(n *html.Node) ([]string, error) {
links := make([]string, 0)
if n.Type == html.ElementNode && n.Data == "link" {
// check for rel="redirect_uri"
rel := ""
href := ""
for _, attr := range n.Attr {
if attr.Key == "href" {
href = attr.Val
}
if attr.Key == "rel" {
rel = attr.Val
}
}
if rel == "redirect_uri" {
linkUrl, err := url.Parse(href)
if err == nil {
links = append(links, requestUrl.ResolveReference(linkUrl).String())
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
childLinks, _ := findLinks(c)
links = append(links, childLinks...)
}
return links, nil
}
2022-11-06 13:36:37 +00:00
body_links, err := findLinks(doc)
return append(body_links, header_links...), err
2022-11-06 13:17:14 +00:00
}