Skip to content
This repository was archived by the owner on Sep 30, 2024. It is now read-only.

Commit aaa464e

Browse files
authored
Cody web: add server-side fetching for URL mentions (#64223)
Due to CORS rules, Cody Web cannot make requests to arbitrary URLs, so URL mentions do not currently work in Cody Web. This adds a GraphQL endpoint to the Cody context resolvers that resolves the contents of a mentioned URL on the server.
1 parent eb1a76e commit aaa464e

File tree

4 files changed

+102
-0
lines changed

4 files changed

+102
-0
lines changed

cmd/frontend/graphqlbackend/cody_context.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ type CodyContextResolver interface {
1111
ChatContext(ctx context.Context, args ChatContextArgs) (ChatContextResolver, error)
1212
RankContext(ctx context.Context, args RankContextArgs) (RankContextResolver, error)
1313
RecordContext(ctx context.Context, args RecordContextArgs) (*EmptyResponse, error)
14+
UrlMentionContext(ctx context.Context, args UrlMentionContextArgs) (UrlMentionContextResolver, error)
1415
// GetCodyContext is the existing Cody Enterprise context endpoint
1516
GetCodyContext(ctx context.Context, args GetContextArgs) ([]ContextResultResolver, error)
1617
}
@@ -22,6 +23,15 @@ type GetContextArgs struct {
2223
TextResultsCount int32
2324
}
2425

26+
// UrlMentionContextArgs holds the arguments of the urlMentionContext query.
type UrlMentionContextArgs struct {
	// Url is the address of the mentioned page whose contents should be
	// fetched. The field keeps the spelling Url (not URL) because renaming it
	// would change the exported API.
	Url string
}
29+
30+
// UrlMentionContextResolver resolves the fields of the result returned for a
// URL mention.
type UrlMentionContextResolver interface {
	// Title returns the title extracted from the page, or nil when there is
	// none.
	Title() *string
	// Content returns the page content.
	Content() string
}
34+
2535
type ContextResultResolver interface {
2636
ToFileChunkContext() (*FileChunkContextResolver, bool)
2737
}

cmd/frontend/graphqlbackend/cody_context.graphql

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,11 @@ extend type Query {
112112
"""
113113
resultsCount: Int
114114
): ChatContextResult!
115+
116+
"""
117+
EXPERIMENTAL: Fetches the relevant context for a mentioned URL
118+
"""
119+
urlMentionContext(url: String!): URLMentionContextResult!
115120
}
116121

117122
"""
@@ -270,3 +275,18 @@ type RetrieverContextItem {
270275
"""
271276
item: CodyContextResult!
272277
}
278+
279+
"""
280+
EXPERIMENTAL: The result of fetching context for a URL mention
281+
"""
282+
type URLMentionContextResult {
    """
    The extracted title of the page, if it exists
    """
    title: String
    """
    The content of the page in its processed and truncated form.
    Not guaranteed to be in any particular format.
    """
    content: String!
}

cmd/frontend/internal/context/resolvers/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ go_library(
2424
"//schema",
2525
"@com_github_cohere_ai_cohere_go_v2//:cohere-go",
2626
"@com_github_cohere_ai_cohere_go_v2//client",
27+
"@com_github_grafana_regexp//:regexp",
28+
"@com_github_k3a_html2text//:html2text",
2729
"@com_github_sourcegraph_conc//iter",
2830
"@com_github_sourcegraph_conc//pool",
2931
"@com_github_sourcegraph_log//:log",

cmd/frontend/internal/context/resolvers/context.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ import (
88
"net/http"
99
"time"
1010

11+
"github.com/grafana/regexp"
12+
"github.com/k3a/html2text"
1113
"github.com/sourcegraph/conc/iter"
1214
"github.com/sourcegraph/conc/pool"
1315
"github.com/sourcegraph/log"
@@ -427,6 +429,74 @@ func (r *Resolver) fetchZoekt(ctx context.Context, query string, repo *types.Rep
427429
return res, partialErrors, nil
428430
}
429431

432+
// titleRegexp captures the text of the first <title> element of an HTML
// document in its first submatch. Matching is case-insensitive, tolerates
// attributes on the opening tag, and excludes whitespace (including newlines)
// surrounding the title text from the capture.
var titleRegexp = regexp.MustCompile(`(?i)<title(?:\s[^>]*)?>\s*([^<]+?)\s*</title>`)

const (
	// urlContextReadLimit is the maximum number of response-body bytes read
	// into memory for a URL mention (5 MiB).
	urlContextReadLimit = 5 * 1024 * 1024
	// urlContextOutputLimit is the maximum length, in bytes, of the text
	// returned as context for a URL mention.
	urlContextOutputLimit = 14000
)
436+
437+
// urlMentionContextResponse is the in-memory result for a URL mention: an
// optional page title and the page content.
type urlMentionContextResponse struct {
	title   *string // nil when no title is available
	content string
}

// Title returns the stored title, which may be nil.
func (resp *urlMentionContextResponse) Title() *string { return resp.title }

// Content returns the stored content.
func (resp *urlMentionContextResponse) Content() string { return resp.content }
449+
450+
func (r *Resolver) UrlMentionContext(ctx context.Context, args graphqlbackend.UrlMentionContextArgs) (graphqlbackend.UrlMentionContextResolver, error) {
451+
req, err := http.NewRequestWithContext(ctx, "GET", args.Url, nil)
452+
if err != nil {
453+
return nil, err
454+
}
455+
456+
// 🚨 SECURITY: This endpoint allows API users to create GET requests against arbitrary URLs.
457+
// To mitigate risk of SSRF, we use an the ExternalClient, which denies requests to internal targets.
458+
resp, err := httpcli.UncachedExternalClient.Do(req)
459+
if err != nil {
460+
return nil, err
461+
}
462+
defer resp.Body.Close()
463+
464+
if resp.StatusCode >= http.StatusBadRequest {
465+
return nil, errors.Errorf("request failed with status %d", resp.StatusCode)
466+
}
467+
468+
// 🚨 SECURITY: Limit the amount of data we will read into memory.
469+
content, err := io.ReadAll(io.LimitReader(resp.Body, urlContextReadLimit))
470+
if err != nil {
471+
return nil, err
472+
}
473+
474+
// Attempt to extract the title
475+
var title *string
476+
if match := titleRegexp.FindSubmatch(content); match != nil {
477+
title = pointers.Ptr(string(match[1]))
478+
}
479+
480+
// Trim to main if it exists since that's a decent signal pointing to the important part of the page.
481+
if idx := bytes.Index(content, []byte("<main")); idx > 0 {
482+
content = content[idx:]
483+
}
484+
if idx := bytes.Index(content, []byte("</main>")); idx > 0 {
485+
content = content[:idx+len("</main>")]
486+
}
487+
488+
// Convert the HTML to text to make the ouptut higher density. The output
489+
// is still pretty crude, but it does enough to capture the description and
490+
// most comments from a github PR. There is significant room to improve
491+
// content extraction here.
492+
textified := html2text.HTML2TextWithOptions(string(content), html2text.WithUnixLineBreaks())
493+
textified = textified[:min(len(textified), urlContextOutputLimit)]
494+
return &urlMentionContextResponse{
495+
title: title,
496+
content: textified,
497+
}, nil
498+
}
499+
430500
// countLines finds the number of lines corresponding to the number of runes. We 'round down'
431501
// to ensure that we don't return more characters than our budget.
432502
func countLines(content string, numRunes int) int {

0 commit comments

Comments
 (0)