FileDistance.cpp
7.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
//===--- FileDistance.cpp - File contents container -------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The FileDistance structure allows calculating the minimum distance to paths
// in a single tree.
// We simply walk up the path's ancestors until we find a node whose cost is
// known, and add the cost of walking back down. Initialization ensures this
// gives the correct path to the roots.
// We cache the results, so that the runtime is O(|A|), where A is the set of
// all distinct ancestors of visited paths.
//
// Example after initialization with /=2, /bar=0, DownCost = 1:
// / = 2
// /bar = 0
//
// After querying /foo/bar and /bar/foo:
// / = 2
// /bar = 0
// /bar/foo = 1
// /foo = 3
// /foo/bar = 4
//
// URIDistance creates FileDistance lazily for each URI scheme encountered. In
// practice this is a small constant factor.
//
//===-------------------------------------------------------------------------//
#include "FileDistance.h"
#include "Logger.h"
#include "llvm/ADT/STLExtras.h"
#include <queue>
namespace clang {
namespace clangd {
// Convert a path into the canonical form.
// Canonical form is either "/", or "/segment" * N:
// C:\foo\bar --> /c:/foo/bar
// /foo/ --> /foo
// a/b/c --> /a/b/c
static llvm::SmallString<128> canonicalize(llvm::StringRef Path) {
llvm::SmallString<128> Result = Path.rtrim('/');
native(Result, llvm::sys::path::Style::posix);
if (Result.empty() || Result.front() != '/')
Result.insert(Result.begin(), '/');
return Result;
}
constexpr const unsigned FileDistance::Unreachable;
const llvm::hash_code FileDistance::RootHash =
llvm::hash_value(llvm::StringRef("/"));
FileDistance::FileDistance(llvm::StringMap<SourceParams> Sources,
const FileDistanceOptions &Opts)
: Opts(Opts) {
llvm::DenseMap<llvm::hash_code, llvm::SmallVector<llvm::hash_code, 4>>
DownEdges;
// Compute the best distance following only up edges.
// Keep track of down edges, in case we can use them to improve on this.
for (const auto &S : Sources) {
auto Canonical = canonicalize(S.getKey());
dlog("Source {0} = {1}, MaxUp = {2}", Canonical, S.second.Cost,
S.second.MaxUpTraversals);
// Walk up to ancestors of this source, assigning cost.
llvm::StringRef Rest = Canonical;
llvm::hash_code Hash = llvm::hash_value(Rest);
for (unsigned I = 0; !Rest.empty(); ++I) {
Rest = parent_path(Rest, llvm::sys::path::Style::posix);
auto NextHash = llvm::hash_value(Rest);
auto &Down = DownEdges[NextHash];
if (!llvm::is_contained(Down, Hash))
Down.push_back(Hash);
// We can't just break after MaxUpTraversals, must still set DownEdges.
if (I > S.getValue().MaxUpTraversals) {
if (Cache.find(Hash) != Cache.end())
break;
} else {
unsigned Cost = S.getValue().Cost + I * Opts.UpCost;
auto R = Cache.try_emplace(Hash, Cost);
if (!R.second) {
if (Cost < R.first->second) {
R.first->second = Cost;
} else {
// If we're not the best way to get to this path, stop assigning.
break;
}
}
}
Hash = NextHash;
}
}
// Now propagate scores parent -> child if that's an improvement.
// BFS ensures we propagate down chains (must visit parents before children).
std::queue<llvm::hash_code> Next;
for (auto Child : DownEdges.lookup(llvm::hash_value(llvm::StringRef(""))))
Next.push(Child);
while (!Next.empty()) {
auto Parent = Next.front();
Next.pop();
auto ParentCost = Cache.lookup(Parent);
for (auto Child : DownEdges.lookup(Parent)) {
if (Parent != RootHash || Opts.AllowDownTraversalFromRoot) {
auto &ChildCost =
Cache.try_emplace(Child, Unreachable).first->getSecond();
if (ParentCost + Opts.DownCost < ChildCost)
ChildCost = ParentCost + Opts.DownCost;
}
Next.push(Child);
}
}
}
unsigned FileDistance::distance(llvm::StringRef Path) {
auto Canonical = canonicalize(Path);
unsigned Cost = Unreachable;
llvm::SmallVector<llvm::hash_code, 16> Ancestors;
// Walk up ancestors until we find a path we know the distance for.
for (llvm::StringRef Rest = Canonical; !Rest.empty();
Rest = parent_path(Rest, llvm::sys::path::Style::posix)) {
auto Hash = llvm::hash_value(Rest);
if (Hash == RootHash && !Ancestors.empty() &&
!Opts.AllowDownTraversalFromRoot) {
Cost = Unreachable;
break;
}
auto It = Cache.find(Hash);
if (It != Cache.end()) {
Cost = It->second;
break;
}
Ancestors.push_back(Hash);
}
// Now we know the costs for (known node, queried node].
// Fill these in, walking down the directory tree.
for (llvm::hash_code Hash : llvm::reverse(Ancestors)) {
if (Cost != Unreachable)
Cost += Opts.DownCost;
Cache.try_emplace(Hash, Cost);
}
dlog("distance({0} = {1})", Path, Cost);
return Cost;
}
unsigned URIDistance::distance(llvm::StringRef URI) {
auto R = Cache.try_emplace(llvm::hash_value(URI), FileDistance::Unreachable);
if (!R.second)
return R.first->getSecond();
if (auto U = clangd::URI::parse(URI)) {
dlog("distance({0} = {1})", URI, U->body());
R.first->second = forScheme(U->scheme()).distance(U->body());
} else {
log("URIDistance::distance() of unparseable {0}: {1}", URI, U.takeError());
}
return R.first->second;
}
FileDistance &URIDistance::forScheme(llvm::StringRef Scheme) {
auto &Delegate = ByScheme[Scheme];
if (!Delegate) {
llvm::StringMap<SourceParams> SchemeSources;
for (const auto &Source : Sources) {
if (auto U = clangd::URI::create(Source.getKey(), Scheme))
SchemeSources.try_emplace(U->body(), Source.getValue());
else
llvm::consumeError(U.takeError());
}
dlog("FileDistance for scheme {0}: {1}/{2} sources", Scheme,
SchemeSources.size(), Sources.size());
Delegate.reset(new FileDistance(std::move(SchemeSources), Opts));
}
return *Delegate;
}
static std::pair<std::string, int> scopeToPath(llvm::StringRef Scope) {
llvm::SmallVector<llvm::StringRef, 4> Split;
Scope.split(Split, "::", /*MaxSplit=*/-1, /*KeepEmpty=*/false);
return {"/" + llvm::join(Split, "/"), Split.size()};
}
static FileDistance
createScopeFileDistance(llvm::ArrayRef<std::string> QueryScopes) {
FileDistanceOptions Opts;
Opts.UpCost = 2;
Opts.DownCost = 4;
Opts.AllowDownTraversalFromRoot = false;
llvm::StringMap<SourceParams> Sources;
llvm::StringRef Preferred =
QueryScopes.empty() ? "" : QueryScopes.front().c_str();
for (llvm::StringRef S : QueryScopes) {
SourceParams Param;
// Penalize the global scope even it's preferred, as all projects can define
// symbols in it, and there is pattern where using-namespace is used in
// place of enclosing namespaces (e.g. in implementation files).
if (S == Preferred)
Param.Cost = S == "" ? 4 : 0;
else if (Preferred.startswith(S) && !S.empty())
continue; // just rely on up-traversals.
else
Param.Cost = S == "" ? 6 : 2;
auto Path = scopeToPath(S);
// The global namespace is not 'near' its children.
Param.MaxUpTraversals = std::max(Path.second - 1, 0);
Sources[Path.first] = std::move(Param);
}
return FileDistance(std::move(Sources), Opts);
}
ScopeDistance::ScopeDistance(llvm::ArrayRef<std::string> QueryScopes)
: Distance(createScopeFileDistance(QueryScopes)) {}
unsigned ScopeDistance::distance(llvm::StringRef SymbolScope) {
return Distance.distance(scopeToPath(SymbolScope).first);
}
} // namespace clangd
} // namespace clang