parent
25fbed4d44
commit
9c6a9cd499
@ -0,0 +1,269 @@ |
||||
package owl |
||||
|
||||
import ( |
||||
"bytes" |
||||
"errors" |
||||
"io" |
||||
"net/http" |
||||
"net/url" |
||||
"strings" |
||||
|
||||
"golang.org/x/net/html" |
||||
) |
||||
|
||||
type HtmlParser interface { |
||||
ParseHEntry(resp *http.Response) (ParsedHEntry, error) |
||||
ParseLinks(resp *http.Response) ([]string, error) |
||||
ParseLinksFromString(string) ([]string, error) |
||||
GetWebmentionEndpoint(resp *http.Response) (string, error) |
||||
GetRedirctUris(resp *http.Response) ([]string, error) |
||||
} |
||||
|
||||
type OwlHtmlParser struct{} |
||||
|
||||
type ParsedHEntry struct { |
||||
Title string |
||||
} |
||||
|
||||
func collectText(n *html.Node, buf *bytes.Buffer) { |
||||
|
||||
if n.Type == html.TextNode { |
||||
buf.WriteString(n.Data) |
||||
} |
||||
for c := n.FirstChild; c != nil; c = c.NextSibling { |
||||
collectText(c, buf) |
||||
} |
||||
} |
||||
|
||||
func readResponseBody(resp *http.Response) (string, error) { |
||||
defer resp.Body.Close() |
||||
bodyBytes, err := io.ReadAll(resp.Body) |
||||
if err != nil { |
||||
return "", err |
||||
} |
||||
return string(bodyBytes), nil |
||||
} |
||||
|
||||
func (OwlHtmlParser) ParseHEntry(resp *http.Response) (ParsedHEntry, error) { |
||||
htmlStr, err := readResponseBody(resp) |
||||
if err != nil { |
||||
return ParsedHEntry{}, err |
||||
} |
||||
doc, err := html.Parse(strings.NewReader(htmlStr)) |
||||
if err != nil { |
||||
return ParsedHEntry{}, err |
||||
} |
||||
|
||||
var interpretHFeed func(*html.Node, *ParsedHEntry, bool) (ParsedHEntry, error) |
||||
interpretHFeed = func(n *html.Node, curr *ParsedHEntry, parent bool) (ParsedHEntry, error) { |
||||
attrs := n.Attr |
||||
for _, attr := range attrs { |
||||
if attr.Key == "class" && strings.Contains(attr.Val, "p-name") { |
||||
buf := &bytes.Buffer{} |
||||
collectText(n, buf) |
||||
curr.Title = buf.String() |
||||
return *curr, nil |
||||
} |
||||
} |
||||
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling { |
||||
interpretHFeed(c, curr, false) |
||||
} |
||||
return *curr, nil |
||||
} |
||||
|
||||
var findHFeed func(*html.Node) (ParsedHEntry, error) |
||||
findHFeed = func(n *html.Node) (ParsedHEntry, error) { |
||||
attrs := n.Attr |
||||
for _, attr := range attrs { |
||||
if attr.Key == "class" && strings.Contains(attr.Val, "h-entry") { |
||||
return interpretHFeed(n, &ParsedHEntry{}, true) |
||||
} |
||||
} |
||||
for c := n.FirstChild; c != nil; c = c.NextSibling { |
||||
entry, err := findHFeed(c) |
||||
if err == nil { |
||||
return entry, nil |
||||
} |
||||
} |
||||
return ParsedHEntry{}, errors.New("no h-entry found") |
||||
} |
||||
return findHFeed(doc) |
||||
} |
||||
|
||||
func (OwlHtmlParser) ParseLinks(resp *http.Response) ([]string, error) { |
||||
htmlStr, err := readResponseBody(resp) |
||||
if err != nil { |
||||
return []string{}, err |
||||
} |
||||
return OwlHtmlParser{}.ParseLinksFromString(htmlStr) |
||||
} |
||||
|
||||
func (OwlHtmlParser) ParseLinksFromString(htmlStr string) ([]string, error) { |
||||
doc, err := html.Parse(strings.NewReader(htmlStr)) |
||||
if err != nil { |
||||
return make([]string, 0), err |
||||
} |
||||
|
||||
var findLinks func(*html.Node) ([]string, error) |
||||
findLinks = func(n *html.Node) ([]string, error) { |
||||
links := make([]string, 0) |
||||
if n.Type == html.ElementNode && n.Data == "a" { |
||||
for _, attr := range n.Attr { |
||||
if attr.Key == "href" { |
||||
links = append(links, attr.Val) |
||||
} |
||||
} |
||||
} |
||||
for c := n.FirstChild; c != nil; c = c.NextSibling { |
||||
childLinks, _ := findLinks(c) |
||||
links = append(links, childLinks...) |
||||
} |
||||
return links, nil |
||||
} |
||||
return findLinks(doc) |
||||
} |
||||
|
||||
func (OwlHtmlParser) GetWebmentionEndpoint(resp *http.Response) (string, error) { |
||||
//request url
|
||||
requestUrl := resp.Request.URL |
||||
|
||||
// Check link headers
|
||||
for _, linkHeader := range resp.Header["Link"] { |
||||
linkHeaderParts := strings.Split(linkHeader, ",") |
||||
for _, linkHeaderPart := range linkHeaderParts { |
||||
linkHeaderPart = strings.TrimSpace(linkHeaderPart) |
||||
params := strings.Split(linkHeaderPart, ";") |
||||
if len(params) != 2 { |
||||
continue |
||||
} |
||||
for _, param := range params[1:] { |
||||
param = strings.TrimSpace(param) |
||||
if strings.Contains(param, "webmention") { |
||||
link := strings.Split(params[0], ";")[0] |
||||
link = strings.Trim(link, "<>") |
||||
linkUrl, err := url.Parse(link) |
||||
if err != nil { |
||||
return "", err |
||||
} |
||||
return requestUrl.ResolveReference(linkUrl).String(), nil |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
htmlStr, err := readResponseBody(resp) |
||||
if err != nil { |
||||
return "", err |
||||
} |
||||
doc, err := html.Parse(strings.NewReader(htmlStr)) |
||||
if err != nil { |
||||
return "", err |
||||
} |
||||
|
||||
var findEndpoint func(*html.Node) (string, error) |
||||
findEndpoint = func(n *html.Node) (string, error) { |
||||
if n.Type == html.ElementNode && (n.Data == "link" || n.Data == "a") { |
||||
for _, attr := range n.Attr { |
||||
if attr.Key == "rel" { |
||||
vals := strings.Split(attr.Val, " ") |
||||
for _, val := range vals { |
||||
if val == "webmention" { |
||||
for _, attr := range n.Attr { |
||||
if attr.Key == "href" { |
||||
return attr.Val, nil |
||||
} |
||||
} |
||||
} |
||||
} |
||||
} |
||||
} |
||||
} |
||||
for c := n.FirstChild; c != nil; c = c.NextSibling { |
||||
endpoint, err := findEndpoint(c) |
||||
if err == nil { |
||||
return endpoint, nil |
||||
} |
||||
} |
||||
return "", errors.New("no webmention endpoint found") |
||||
} |
||||
linkUrlStr, err := findEndpoint(doc) |
||||
if err != nil { |
||||
return "", err |
||||
} |
||||
linkUrl, err := url.Parse(linkUrlStr) |
||||
if err != nil { |
||||
return "", err |
||||
} |
||||
return requestUrl.ResolveReference(linkUrl).String(), nil |
||||
} |
||||
|
||||
func (OwlHtmlParser) GetRedirctUris(resp *http.Response) ([]string, error) { |
||||
//request url
|
||||
requestUrl := resp.Request.URL |
||||
|
||||
htmlStr, err := readResponseBody(resp) |
||||
if err != nil { |
||||
return make([]string, 0), err |
||||
} |
||||
doc, err := html.Parse(strings.NewReader(htmlStr)) |
||||
if err != nil { |
||||
return make([]string, 0), err |
||||
} |
||||
|
||||
var findLinks func(*html.Node) ([]string, error) |
||||
// Check link headers
|
||||
header_links := make([]string, 0) |
||||
for _, linkHeader := range resp.Header["Link"] { |
||||
linkHeaderParts := strings.Split(linkHeader, ",") |
||||
for _, linkHeaderPart := range linkHeaderParts { |
||||
linkHeaderPart = strings.TrimSpace(linkHeaderPart) |
||||
params := strings.Split(linkHeaderPart, ";") |
||||
if len(params) != 2 { |
||||
continue |
||||
} |
||||
for _, param := range params[1:] { |
||||
param = strings.TrimSpace(param) |
||||
if strings.Contains(param, "redirect_uri") { |
||||
link := strings.Split(params[0], ";")[0] |
||||
link = strings.Trim(link, "<>") |
||||
linkUrl, err := url.Parse(link) |
||||
if err == nil { |
||||
header_links = append(header_links, requestUrl.ResolveReference(linkUrl).String()) |
||||
} |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
findLinks = func(n *html.Node) ([]string, error) { |
||||
links := make([]string, 0) |
||||
if n.Type == html.ElementNode && n.Data == "link" { |
||||
// check for rel="redirect_uri"
|
||||
rel := "" |
||||
href := "" |
||||
|
||||
for _, attr := range n.Attr { |
||||
if attr.Key == "href" { |
||||
href = attr.Val |
||||
} |
||||
if attr.Key == "rel" { |
||||
rel = attr.Val |
||||
} |
||||
} |
||||
if rel == "redirect_uri" { |
||||
linkUrl, err := url.Parse(href) |
||||
if err == nil { |
||||
links = append(links, requestUrl.ResolveReference(linkUrl).String()) |
||||
} |
||||
} |
||||
} |
||||
for c := n.FirstChild; c != nil; c = c.NextSibling { |
||||
childLinks, _ := findLinks(c) |
||||
links = append(links, childLinks...) |
||||
} |
||||
return links, nil |
||||
} |
||||
body_links, err := findLinks(doc) |
||||
return append(body_links, header_links...), err |
||||
} |
@ -0,0 +1,15 @@ |
||||
package owl |
||||
|
||||
import ( |
||||
"io" |
||||
"net/http" |
||||
"net/url" |
||||
) |
||||
|
||||
type HttpClient interface { |
||||
Get(url string) (resp *http.Response, err error) |
||||
Post(url, contentType string, body io.Reader) (resp *http.Response, err error) |
||||
PostForm(url string, data url.Values) (resp *http.Response, err error) |
||||
} |
||||
|
||||
type OwlHttpClient = http.Client |
Loading…
Reference in new issue