#!/usr/bin/env python3
from http.server import BaseHTTPRequestHandler, HTTPServer

import chromadb
import collections
import hashlib
import json


def checksum(string):
    """Return a short, stable id for a document string (first 32 hex chars of SHA-256)."""
    sha256 = hashlib.sha256()
    sha256.update(string.encode("utf-8"))
    return sha256.hexdigest()[:32]


def ensure_list(data):
    """Accept a single string or a list of strings; always return a list of strings."""
    if isinstance(data, str):
        return [data]
    if isinstance(data, list) and all(isinstance(item, str) for item in data):
        return data
    raise ValueError("Data must be a string or a list of strings")


class MyRequestHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        content_length = int(self.headers["Content-Length"])
        post_data = self.rfile.read(content_length).decode("utf-8")
        try:
            data = json.loads(post_data)
        except ValueError:
            # Bail out early on malformed JSON instead of falling through.
            self.send_error(400, "Invalid JSON data")
            return
        if query := data.get("query"):
            # Semantic search: return the nearest documents for the query text(s).
            response = collection.query(query_texts=ensure_list(query))
        elif paragraphs := data.get("store"):
            # Ingest: de-duplicate, then add documents keyed by their checksum.
            documents, metadatas = drop_duplicates(paragraphs)
            collection.add(
                documents=documents,
                metadatas=metadatas,
                ids=[checksum(doc) for doc in documents],
            )
            response = {"added": documents}
        else:
            self.send_error(400, f"Unknown request, got keys: {list(data.keys())}")
            return
        response_message = json.dumps(response)
        self.send_response(200)
        self.send_header("Content-type", "application/json")
        self.end_headers()
        self.wfile.write(response_message.encode("utf-8"))


def run_server(port=8080):
    server_address = ("", port)
    httpd = HTTPServer(server_address, MyRequestHandler)
    print(f"Server running on port {port}")
    httpd.serve_forever()


def drop_duplicates(paragraphs):
    """Keep only the first occurrence of each document, together with its metadata."""
    documents = [entry["document"] for entry in paragraphs]
    metadatas = [entry["metadata"] for entry in paragraphs]
    duplicates = (x for x, count in collections.Counter(documents).items() if count > 1)
    to_drop = []
    for doc in duplicates:
        # Drop every occurrence after the first one.
        to_drop.extend([i for i, d in enumerate(documents) if d == doc][1:])
    to_drop.sort(reverse=True)
    for index in to_drop:
        documents.pop(index)
        metadatas.pop(index)
    return documents, metadatas


def test():
    sample = [
        {"document": "Hello", "metadata": 5},
        {"document": "World", "metadata": 8},
        {"document": "Hello", "metadata": 6},
        {"document": "Good", "metadata": 3},
        {"document": "World", "metadata": 9},
    ]
    assert drop_duplicates(sample) == (["Hello", "World", "Good"], [5, 8, 3])


if __name__ == "__main__":
    client = chromadb.PersistentClient(path="./semgrep")
    collection = client.get_or_create_collection("org-roam")
    run_server()
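The server speaks a small JSON protocol over POST: a body with a "store" key adds documents to the Chroma collection, and a body with a "query" key runs a semantic search against it. The snippet below is a minimal client sketch for exercising it, assuming the server is running locally on port 8080; the post() helper and the sample documents and metadata dicts are illustrative only and not part of the script above (Chroma expects each metadata value to be a flat dict of scalars).

#!/usr/bin/env python3
# Hypothetical client sketch for the server above.
import json
import urllib.request


def post(payload, url="http://localhost:8080"):
    """Send a JSON payload to the server and decode the JSON reply."""
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request) as response:
        return json.loads(response.read().decode("utf-8"))


if __name__ == "__main__":
    # Store two paragraphs, then query them semantically.
    print(post({"store": [
        {"document": "Emacs is an extensible text editor.",
         "metadata": {"file": "emacs.org"}},
        {"document": "Chroma is an embedding database.",
         "metadata": {"file": "chroma.org"}},
    ]}))
    print(post({"query": "Which editor can I extend?"}))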