From 9c6a9cd499726b6e91e07a32e59f4200cdf3af82 Mon Sep 17 00:00:00 2001 From: Niko Abeler Date: Tue, 8 Nov 2022 19:24:04 +0100 Subject: [PATCH] split webmention into html and http --- html.go | 269 +++++++++++++++++++++++++++++++++++++++++++++++++ http.go | 15 +++ webmention.go | 273 -------------------------------------------------- 3 files changed, 284 insertions(+), 273 deletions(-) create mode 100644 html.go create mode 100644 http.go diff --git a/html.go b/html.go new file mode 100644 index 0000000..7a86643 --- /dev/null +++ b/html.go @@ -0,0 +1,269 @@ +package owl + +import ( + "bytes" + "errors" + "io" + "net/http" + "net/url" + "strings" + + "golang.org/x/net/html" +) + +type HtmlParser interface { + ParseHEntry(resp *http.Response) (ParsedHEntry, error) + ParseLinks(resp *http.Response) ([]string, error) + ParseLinksFromString(string) ([]string, error) + GetWebmentionEndpoint(resp *http.Response) (string, error) + GetRedirctUris(resp *http.Response) ([]string, error) +} + +type OwlHtmlParser struct{} + +type ParsedHEntry struct { + Title string +} + +func collectText(n *html.Node, buf *bytes.Buffer) { + + if n.Type == html.TextNode { + buf.WriteString(n.Data) + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + collectText(c, buf) + } +} + +func readResponseBody(resp *http.Response) (string, error) { + defer resp.Body.Close() + bodyBytes, err := io.ReadAll(resp.Body) + if err != nil { + return "", err + } + return string(bodyBytes), nil +} + +func (OwlHtmlParser) ParseHEntry(resp *http.Response) (ParsedHEntry, error) { + htmlStr, err := readResponseBody(resp) + if err != nil { + return ParsedHEntry{}, err + } + doc, err := html.Parse(strings.NewReader(htmlStr)) + if err != nil { + return ParsedHEntry{}, err + } + + var interpretHFeed func(*html.Node, *ParsedHEntry, bool) (ParsedHEntry, error) + interpretHFeed = func(n *html.Node, curr *ParsedHEntry, parent bool) (ParsedHEntry, error) { + attrs := n.Attr + for _, attr := range attrs { + if attr.Key == "class" && strings.Contains(attr.Val, "p-name") { + buf := &bytes.Buffer{} + collectText(n, buf) + curr.Title = buf.String() + return *curr, nil + } + } + + for c := n.FirstChild; c != nil; c = c.NextSibling { + interpretHFeed(c, curr, false) + } + return *curr, nil + } + + var findHFeed func(*html.Node) (ParsedHEntry, error) + findHFeed = func(n *html.Node) (ParsedHEntry, error) { + attrs := n.Attr + for _, attr := range attrs { + if attr.Key == "class" && strings.Contains(attr.Val, "h-entry") { + return interpretHFeed(n, &ParsedHEntry{}, true) + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + entry, err := findHFeed(c) + if err == nil { + return entry, nil + } + } + return ParsedHEntry{}, errors.New("no h-entry found") + } + return findHFeed(doc) +} + +func (OwlHtmlParser) ParseLinks(resp *http.Response) ([]string, error) { + htmlStr, err := readResponseBody(resp) + if err != nil { + return []string{}, err + } + return OwlHtmlParser{}.ParseLinksFromString(htmlStr) +} + +func (OwlHtmlParser) ParseLinksFromString(htmlStr string) ([]string, error) { + doc, err := html.Parse(strings.NewReader(htmlStr)) + if err != nil { + return make([]string, 0), err + } + + var findLinks func(*html.Node) ([]string, error) + findLinks = func(n *html.Node) ([]string, error) { + links := make([]string, 0) + if n.Type == html.ElementNode && n.Data == "a" { + for _, attr := range n.Attr { + if attr.Key == "href" { + links = append(links, attr.Val) + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + childLinks, _ := findLinks(c) + links = append(links, childLinks...) + } + return links, nil + } + return findLinks(doc) +} + +func (OwlHtmlParser) GetWebmentionEndpoint(resp *http.Response) (string, error) { + //request url + requestUrl := resp.Request.URL + + // Check link headers + for _, linkHeader := range resp.Header["Link"] { + linkHeaderParts := strings.Split(linkHeader, ",") + for _, linkHeaderPart := range linkHeaderParts { + linkHeaderPart = strings.TrimSpace(linkHeaderPart) + params := strings.Split(linkHeaderPart, ";") + if len(params) != 2 { + continue + } + for _, param := range params[1:] { + param = strings.TrimSpace(param) + if strings.Contains(param, "webmention") { + link := strings.Split(params[0], ";")[0] + link = strings.Trim(link, "<>") + linkUrl, err := url.Parse(link) + if err != nil { + return "", err + } + return requestUrl.ResolveReference(linkUrl).String(), nil + } + } + } + } + + htmlStr, err := readResponseBody(resp) + if err != nil { + return "", err + } + doc, err := html.Parse(strings.NewReader(htmlStr)) + if err != nil { + return "", err + } + + var findEndpoint func(*html.Node) (string, error) + findEndpoint = func(n *html.Node) (string, error) { + if n.Type == html.ElementNode && (n.Data == "link" || n.Data == "a") { + for _, attr := range n.Attr { + if attr.Key == "rel" { + vals := strings.Split(attr.Val, " ") + for _, val := range vals { + if val == "webmention" { + for _, attr := range n.Attr { + if attr.Key == "href" { + return attr.Val, nil + } + } + } + } + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + endpoint, err := findEndpoint(c) + if err == nil { + return endpoint, nil + } + } + return "", errors.New("no webmention endpoint found") + } + linkUrlStr, err := findEndpoint(doc) + if err != nil { + return "", err + } + linkUrl, err := url.Parse(linkUrlStr) + if err != nil { + return "", err + } + return requestUrl.ResolveReference(linkUrl).String(), nil +} + +func (OwlHtmlParser) GetRedirctUris(resp *http.Response) ([]string, error) { + //request url + requestUrl := resp.Request.URL + + htmlStr, err := readResponseBody(resp) + if err != nil { + return make([]string, 0), err + } + doc, err := html.Parse(strings.NewReader(htmlStr)) + if err != nil { + return make([]string, 0), err + } + + var findLinks func(*html.Node) ([]string, error) + // Check link headers + header_links := make([]string, 0) + for _, linkHeader := range resp.Header["Link"] { + linkHeaderParts := strings.Split(linkHeader, ",") + for _, linkHeaderPart := range linkHeaderParts { + linkHeaderPart = strings.TrimSpace(linkHeaderPart) + params := strings.Split(linkHeaderPart, ";") + if len(params) != 2 { + continue + } + for _, param := range params[1:] { + param = strings.TrimSpace(param) + if strings.Contains(param, "redirect_uri") { + link := strings.Split(params[0], ";")[0] + link = strings.Trim(link, "<>") + linkUrl, err := url.Parse(link) + if err == nil { + header_links = append(header_links, requestUrl.ResolveReference(linkUrl).String()) + } + } + } + } + } + + findLinks = func(n *html.Node) ([]string, error) { + links := make([]string, 0) + if n.Type == html.ElementNode && n.Data == "link" { + // check for rel="redirect_uri" + rel := "" + href := "" + + for _, attr := range n.Attr { + if attr.Key == "href" { + href = attr.Val + } + if attr.Key == "rel" { + rel = attr.Val + } + } + if rel == "redirect_uri" { + linkUrl, err := url.Parse(href) + if err == nil { + links = append(links, requestUrl.ResolveReference(linkUrl).String()) + } + } + } + for c := n.FirstChild; c != nil; c = c.NextSibling { + childLinks, _ := findLinks(c) + links = append(links, childLinks...) + } + return links, nil + } + body_links, err := findLinks(doc) + return append(body_links, header_links...), err +} diff --git a/http.go b/http.go new file mode 100644 index 0000000..7a2f106 --- /dev/null +++ b/http.go @@ -0,0 +1,15 @@ +package owl + +import ( + "io" + "net/http" + "net/url" +) + +type HttpClient interface { + Get(url string) (resp *http.Response, err error) + Post(url, contentType string, body io.Reader) (resp *http.Response, err error) + PostForm(url string, data url.Values) (resp *http.Response, err error) +} + +type OwlHttpClient = http.Client diff --git a/webmention.go b/webmention.go index 046af1c..e0e0f94 100644 --- a/webmention.go +++ b/webmention.go @@ -1,15 +1,7 @@ package owl import ( - "bytes" - "errors" - "io" - "net/http" - "net/url" - "strings" "time" - - "golang.org/x/net/html" ) type WebmentionIn struct { @@ -49,268 +41,3 @@ func (webmention *WebmentionOut) UpdateWith(update WebmentionOut) { webmention.LastSentAt = update.LastSentAt } } - -type HttpClient interface { - Get(url string) (resp *http.Response, err error) - Post(url, contentType string, body io.Reader) (resp *http.Response, err error) - PostForm(url string, data url.Values) (resp *http.Response, err error) -} - -type HtmlParser interface { - ParseHEntry(resp *http.Response) (ParsedHEntry, error) - ParseLinks(resp *http.Response) ([]string, error) - ParseLinksFromString(string) ([]string, error) - GetWebmentionEndpoint(resp *http.Response) (string, error) - GetRedirctUris(resp *http.Response) ([]string, error) -} - -type OwlHttpClient = http.Client - -type OwlHtmlParser struct{} - -type ParsedHEntry struct { - Title string -} - -func collectText(n *html.Node, buf *bytes.Buffer) { - - if n.Type == html.TextNode { - buf.WriteString(n.Data) - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - collectText(c, buf) - } -} - -func readResponseBody(resp *http.Response) (string, error) { - defer resp.Body.Close() - bodyBytes, err := io.ReadAll(resp.Body) - if err != nil { - return "", err - } - return string(bodyBytes), nil -} - -func (OwlHtmlParser) ParseHEntry(resp *http.Response) (ParsedHEntry, error) { - htmlStr, err := readResponseBody(resp) - if err != nil { - return ParsedHEntry{}, err - } - doc, err := html.Parse(strings.NewReader(htmlStr)) - if err != nil { - return ParsedHEntry{}, err - } - - var interpretHFeed func(*html.Node, *ParsedHEntry, bool) (ParsedHEntry, error) - interpretHFeed = func(n *html.Node, curr *ParsedHEntry, parent bool) (ParsedHEntry, error) { - attrs := n.Attr - for _, attr := range attrs { - if attr.Key == "class" && strings.Contains(attr.Val, "p-name") { - buf := &bytes.Buffer{} - collectText(n, buf) - curr.Title = buf.String() - return *curr, nil - } - } - - for c := n.FirstChild; c != nil; c = c.NextSibling { - interpretHFeed(c, curr, false) - } - return *curr, nil - } - - var findHFeed func(*html.Node) (ParsedHEntry, error) - findHFeed = func(n *html.Node) (ParsedHEntry, error) { - attrs := n.Attr - for _, attr := range attrs { - if attr.Key == "class" && strings.Contains(attr.Val, "h-entry") { - return interpretHFeed(n, &ParsedHEntry{}, true) - } - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - entry, err := findHFeed(c) - if err == nil { - return entry, nil - } - } - return ParsedHEntry{}, errors.New("no h-entry found") - } - return findHFeed(doc) -} - -func (OwlHtmlParser) ParseLinks(resp *http.Response) ([]string, error) { - htmlStr, err := readResponseBody(resp) - if err != nil { - return []string{}, err - } - return OwlHtmlParser{}.ParseLinksFromString(htmlStr) -} - -func (OwlHtmlParser) ParseLinksFromString(htmlStr string) ([]string, error) { - doc, err := html.Parse(strings.NewReader(htmlStr)) - if err != nil { - return make([]string, 0), err - } - - var findLinks func(*html.Node) ([]string, error) - findLinks = func(n *html.Node) ([]string, error) { - links := make([]string, 0) - if n.Type == html.ElementNode && n.Data == "a" { - for _, attr := range n.Attr { - if attr.Key == "href" { - links = append(links, attr.Val) - } - } - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - childLinks, _ := findLinks(c) - links = append(links, childLinks...) - } - return links, nil - } - return findLinks(doc) -} - -func (OwlHtmlParser) GetWebmentionEndpoint(resp *http.Response) (string, error) { - //request url - requestUrl := resp.Request.URL - - // Check link headers - for _, linkHeader := range resp.Header["Link"] { - linkHeaderParts := strings.Split(linkHeader, ",") - for _, linkHeaderPart := range linkHeaderParts { - linkHeaderPart = strings.TrimSpace(linkHeaderPart) - params := strings.Split(linkHeaderPart, ";") - if len(params) != 2 { - continue - } - for _, param := range params[1:] { - param = strings.TrimSpace(param) - if strings.Contains(param, "webmention") { - link := strings.Split(params[0], ";")[0] - link = strings.Trim(link, "<>") - linkUrl, err := url.Parse(link) - if err != nil { - return "", err - } - return requestUrl.ResolveReference(linkUrl).String(), nil - } - } - } - } - - htmlStr, err := readResponseBody(resp) - if err != nil { - return "", err - } - doc, err := html.Parse(strings.NewReader(htmlStr)) - if err != nil { - return "", err - } - - var findEndpoint func(*html.Node) (string, error) - findEndpoint = func(n *html.Node) (string, error) { - if n.Type == html.ElementNode && (n.Data == "link" || n.Data == "a") { - for _, attr := range n.Attr { - if attr.Key == "rel" { - vals := strings.Split(attr.Val, " ") - for _, val := range vals { - if val == "webmention" { - for _, attr := range n.Attr { - if attr.Key == "href" { - return attr.Val, nil - } - } - } - } - } - } - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - endpoint, err := findEndpoint(c) - if err == nil { - return endpoint, nil - } - } - return "", errors.New("no webmention endpoint found") - } - linkUrlStr, err := findEndpoint(doc) - if err != nil { - return "", err - } - linkUrl, err := url.Parse(linkUrlStr) - if err != nil { - return "", err - } - return requestUrl.ResolveReference(linkUrl).String(), nil -} - -func (OwlHtmlParser) GetRedirctUris(resp *http.Response) ([]string, error) { - //request url - requestUrl := resp.Request.URL - - htmlStr, err := readResponseBody(resp) - if err != nil { - return make([]string, 0), err - } - doc, err := html.Parse(strings.NewReader(htmlStr)) - if err != nil { - return make([]string, 0), err - } - - var findLinks func(*html.Node) ([]string, error) - // Check link headers - header_links := make([]string, 0) - for _, linkHeader := range resp.Header["Link"] { - linkHeaderParts := strings.Split(linkHeader, ",") - for _, linkHeaderPart := range linkHeaderParts { - linkHeaderPart = strings.TrimSpace(linkHeaderPart) - params := strings.Split(linkHeaderPart, ";") - if len(params) != 2 { - continue - } - for _, param := range params[1:] { - param = strings.TrimSpace(param) - if strings.Contains(param, "redirect_uri") { - link := strings.Split(params[0], ";")[0] - link = strings.Trim(link, "<>") - linkUrl, err := url.Parse(link) - if err == nil { - header_links = append(header_links, requestUrl.ResolveReference(linkUrl).String()) - } - } - } - } - } - - findLinks = func(n *html.Node) ([]string, error) { - links := make([]string, 0) - if n.Type == html.ElementNode && n.Data == "link" { - // check for rel="redirect_uri" - rel := "" - href := "" - - for _, attr := range n.Attr { - if attr.Key == "href" { - href = attr.Val - } - if attr.Key == "rel" { - rel = attr.Val - } - } - if rel == "redirect_uri" { - linkUrl, err := url.Parse(href) - if err == nil { - links = append(links, requestUrl.ResolveReference(linkUrl).String()) - } - } - } - for c := n.FirstChild; c != nil; c = c.NextSibling { - childLinks, _ := findLinks(c) - links = append(links, childLinks...) - } - return links, nil - } - body_links, err := findLinks(doc) - return append(body_links, header_links...), err -}