Making a Google scraper to track SEO rankings

Starting the project
Prepare our parameters
Implement the steps
Last updated

Last updated
ftr login -a automation_id -b devftr watchpackage seoscraper
import basicgroupsprotocol "github.com/futura-platform/protocol/basicgroups/protocol"
type searchTerm string
// Equals implements basicgroupsprotocol.Parsable.
func (s searchTerm) Equals(s2 searchTerm) bool {
return s == s2
}
// GetGroupConfig implements basicgroupsprotocol.Parsable.
func (searchTerm) GetGroupConfig() basicgroupsprotocol.GroupConfig {
return basicgroupsprotocol.GroupConfig{
EntryTypeSingular: "Search Term",
EntryTypePlural: "Search Terms",
EntryPlaceholder: "Enter a search term",
Icon: `<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="1.5" stroke="currentColor" class="size-6">
<path stroke-linecap="round" stroke-linejoin="round" d="m15.75 15.75-2.489-2.489m0 0a3.375 3.375 0 1 0-4.773-4.773 3.375 3.375 0 0 0 4.774 4.774ZM21 12a9 9 0 1 1-18 0 9 9 0 0 1 18 0Z" />
</svg>
`,
}
}
// ParseEntry implements basicgroupsprotocol.Parsable.
func (searchTerm) ParseEntry(s string) (searchTerm, error) {
return searchTerm(s), nil
}
// SerializeEntry implements basicgroupsprotocol.Parsable.
func (s searchTerm) SerializeEntry() string {
return string(s)
}type Params struct {
SearchTerm basicgroupsprotocol.EntryProvided[SearchTerm]
}package seoscraper
import (
"context"
"fmt"
"math/rand/v2"
"net/http"
"net/url"
"time"
"github.com/chromedp/cdproto/network"
"github.com/chromedp/chromedp"
"github.com/futura-platform/protocol/flowprotocol"
)
// InitializeSession warms up a Google session: it spawns a single-tab
// browser through the task's proxy, performs a throwaway search with a
// random numeric query so Google issues its session cookies, then copies
// those cookies into the task's HTTP cookie jar so subsequent plain-HTTP
// requests present the same session.
func (t *Task) InitializeSession() flowprotocol.TaskStepResult {
b, cancel, err := t.SpawnSingleTabBrowser(t, t.GetProxy())
if err != nil {
// A browser that cannot spawn at all is unrecoverable for this task.
return t.ReturnFatalErrorf("failed to spawn browser: %w", err)
}
defer cancel()
// The query value is irrelevant — a random int just makes the request
// look like a real search. The URL parse cannot fail for this constant
// base, hence the ignored error.
u, _ := url.Parse("https://www.google.com/search?" + url.Values{
"q": []string{fmt.Sprint(rand.Int())},
}.Encode())
// retrieve the session cookies
var cookies []*network.Cookie
err = chromedp.Run(b.CTX,
chromedp.Navigate(u.String()),
chromedp.ActionFunc(func(ctx context.Context) error {
var err error
// Pull every cookie the browser now holds via the CDP Network domain.
cookies, err = network.GetCookies().Do(ctx)
return err
}),
)
if err != nil {
// Small (retryable) error: navigation failures are often transient.
return t.ReturnSmallErrorf("failed to navigate to Google: %w", err)
}
// Convert CDP cookies into net/http cookies for the task's jar.
httpCookies := make([]*http.Cookie, len(cookies))
for i, cookie := range cookies {
httpCookies[i] = &http.Cookie{
Name: cookie.Name,
Value: cookie.Value,
Domain: cookie.Domain,
Path: cookie.Path,
// CDP reports Expires as float seconds since the epoch.
// NOTE(review): session cookies report Expires == -1, which this maps
// to time.Unix(-1, 0) — i.e. already expired; confirm the jar treats
// that acceptably.
Expires: time.Unix(int64(cookie.Expires), 0),
}
}
t.GetCookieJar().SetCookies(u, httpCookies)
return t.ReturnBasicStepSuccess()
// ConstructTask wires a protocol task to this package's step list.
// NOTE(review): only InitializeSession is registered here, yet
// ReportResults jumps to a step labeled "FetchSearchResults" — presumably
// a later revision of the tutorial appends the remaining steps.
}func ConstructTask(base *protocol.Task[Params]) (protocol.BaseTask, []flowprotocol.TaskStep, error) {
t := &Task{
Task: base,
}
return t,
[]flowprotocol.TaskStep{
{StepFunc: t.InitializeSession},
},
nil
// Task embeds the generic protocol task and carries the scrape state.
// NOTE(review): ReportResults also reads t.lastTopLevelSearchResults,
// which is not declared on this struct as shown — verify against the
// complete source.
}type Task struct {
*protocol.Task[Params]
// topLevelSearchResults is the ordered list of organic result URLs from
// the most recent scrape.
topLevelSearchResults []*url.URL
// What follows is the body of the FetchSearchResults step; its function
// header is not shown in this chunk. It fetches the Google results page
// for the configured term and records the organic result URLs in order.
}resp, err := t.Get("https://www.google.com/search?"+url.Values{
"q": []string{string(*t.Params.SearchTerm)},
}.Encode(), getHeaders())
if err != nil {
return t.ReturnSmallErrorf("failed to make request: %w", err)
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
return t.ReturnFatalErrorf("bad status code: %d", resp.StatusCode)
}// parse the response body with goquery
doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
return t.ReturnSmallErrorf("failed to parse response body: %w", err)
}
// detect if we received the JS challenge page, if so then retry initialization
if doc.Find("script").FilterFunction(func(i int, s *goquery.Selection) bool {
// The challenge page's inline script sets SG_SS=; its presence means we
// were served the bot check instead of results.
return strings.Contains(s.Text(), `SG_SS=`)
}).Length() > 0 {
// Loop back to InitializeSession to mint a fresh browser session.
r := t.ReturnSmallErrorf("encountered JS challenge page")
r.NextStepLabel = "InitializeSession"
return r
}
// find the top-level search result anchors
// Organic results are anchors whose first child element is an <h3> title.
resultAnchors := doc.Find("a[href]").FilterFunction(func(i int, s *goquery.Selection) bool {
first := s.Children().First()
return goquery.NodeName(first) == "h3"
})
t.topLevelSearchResults = make([]*url.URL, resultAnchors.Length())
for i, a := range resultAnchors.EachIter() {
href, exists := a.Attr("href")
if !exists {
return t.ReturnSmallErrorf("search result anchor does not have href attribute")
}
u, err := url.Parse(href)
if err != nil {
return t.ReturnSmallErrorf("failed to parse search result URL: %w", err)
}
t.topLevelSearchResults[i] = u
}
t.SmallSuccessf("fetched %d top-level search results for term '%s'", len(t.topLevelSearchResults), t.Params.SearchTerm)
for _, u := range t.topLevelSearchResults {
fmt.Println(" -", u.String())
}
return t.ReturnBasicStepSuccess()package seoscraper
import (
"fmt"
"strings"
"github.com/futura-platform/protocol/flowprotocol"
"github.com/go-resty/resty/v2"
)
// Emoji prefixes used in the webhook report to show how each result's
// rank moved relative to the previous scrape.
const (
upSymbol = "⬆️"
downSymbol = "⬇️"
neutralSymbol = "➖"
newSymbol = "🆕"
)
// webhookClient is a shared resty client for posting reports, reused so
// underlying HTTP connections can be pooled across calls.
var webhookClient = resty.New()
// ReportResults diffs the latest scrape against the previous snapshot and,
// when anything changed, posts a per-URL rank-movement summary to the
// configured webhook. It then sleeps and loops back to FetchSearchResults.
func (t *Task) ReportResults() flowprotocol.TaskStepResult {
	var report strings.Builder

	// Always report on the first run (no baseline yet). Also report when the
	// result count changed: a result dropping off the END of the list leaves
	// every remaining delta at zero and would otherwise go unnoticed.
	didChange := t.lastTopLevelSearchResults == nil ||
		len(t.lastTopLevelSearchResults) != len(t.topLevelSearchResults)

	for i1, latestResult := range t.topLevelSearchResults {
		// Find this URL in the previous snapshot; a positive delta means it
		// moved up the rankings (toward index 0).
		var rankChange *int
		for i2, lastResult := range t.lastTopLevelSearchResults {
			if latestResult.String() == lastResult.String() {
				delta := i2 - i1
				rankChange = &delta
				break
			}
		}

		statusText := newSymbol
		if rankChange == nil {
			// FIX: a brand-new entrant is itself a change; previously a 🆕
			// result alone never triggered a webhook report.
			didChange = true
		} else {
			rc := *rankChange
			if rc != 0 {
				didChange = true
			}
			switch {
			case rc < 0:
				statusText = downSymbol
			case rc > 0:
				statusText = upSymbol
			default:
				statusText = neutralSymbol
			}
		}
		report.WriteString(fmt.Sprintf("%s - <%s>\n", statusText, latestResult.String()))
	}

	if didChange {
		// report the change. Discord-style webhooks answer 204 No Content on
		// success (when not using ?wait=true).
		resp, err := webhookClient.R().
			SetHeader("Content-Type", "application/json").
			SetBody(map[string]string{
				"content": report.String(),
			}).
			Post(t.Params.ResultsWebhook)
		if err != nil {
			return t.ReturnSmallErrorf("failed to request webhook: %w", err)
		} else if resp.StatusCode() != 204 {
			return t.ReturnSmallErrorf("bad webhook status code %d", resp.StatusCode())
		}
	}

	// The latest snapshot becomes the baseline for the next comparison.
	t.lastTopLevelSearchResults = t.topLevelSearchResults

	// wait and scrape again
	t.Sleep(t.GetErrorDelay())
	return flowprotocol.TaskStepResult{
		NextStepLabel: "FetchSearchResults",
	}
}