Skip to content

Commit aa88ecc

Browse files
committed
cc warc
1 parent df32769 commit aa88ecc

File tree

7 files changed

+90060
-0
lines changed

7 files changed

+90060
-0
lines changed

pgml-apps/cc-warc/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
venv/
2+
__pycache__

pgml-apps/cc-warc/api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# The package is named ``flask`` (lowercase); ``Flask`` is the application
# class it exports, not an importable module.
from flask import Flask

pgml-apps/cc-warc/cli.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from warcio.archiveiterator import ArchiveIterator
2+
import click
3+
import asyncio
4+
import requests
5+
from pgml import Collection, Pipeline
6+
from config import DATABASE_URL
7+
8+
# Handle to the "warc" collection in the database named by DATABASE_URL.
collection = Collection("warc", DATABASE_URL)

# "warc_search" pipeline: configures a splitter and a semantic-search
# embedding model for the "body" key of each document.
pipeline = Pipeline(
    "warc_search",
    {
        "body": {
            "splitter": {"model": "recursive_character"},
            "semantic_search": {
                "model": "mixedbread-ai/mxbai-embed-large-v1",
            },
        },
    },
)
17+
18+
async def ingest(paths, limit=500):
    """Stream Common Crawl archives and upsert their records into ``collection``.

    Each non-blank line of the ``paths`` file is appended to
    ``https://data.commoncrawl.org/`` and downloaded as a stream. Every
    archive record that has a ``WARC-Target-URI`` header becomes a document
    ``{"id": <uri>, "body": <decoded payload>}``.

    Args:
        paths: Filesystem path of a text file listing one archive path per
            line.
        limit: Maximum number of documents buffered before they are sent in
            a single ``upsert_documents`` call.

    Raises:
        requests.HTTPError: If an archive download returns an error status.
    """
    await collection.add_pipeline(pipeline)
    with open(paths) as f:
        for path in f:
            path = path.strip()
            if not path:
                # A blank line would otherwise request the bucket root.
                continue

            batch = []
            url = "https://data.commoncrawl.org/%s" % path
            # Using the response as a context manager releases the streamed
            # connection even if parsing or upserting fails midway.
            with requests.get(url, stream=True) as req:
                req.raise_for_status()
                for record in ArchiveIterator(req.raw, arc2warc=True):
                    uri = record.rec_headers.get_header("WARC-Target-URI")
                    if not uri:
                        # Records without a target URI (presumably the
                        # archive's metadata records) would yield a document
                        # whose id is None, which the upsert rejects.
                        continue
                    batch.append({
                        "id": uri,
                        # Crawled text is not guaranteed to be valid UTF-8;
                        # substitute bad bytes instead of aborting the run.
                        "body": record.content_stream().read().decode(
                            "utf-8", errors="replace"
                        ),
                    })
                    if len(batch) >= limit:
                        await collection.upsert_documents(batch)
                        batch = []

            # Flush whatever is left once the archive is exhausted.
            if batch:
                await collection.upsert_documents(batch)
35+
36+
37+
@click.command()
@click.option("--path", default="paths.txt", help="Path to the WET paths file.")
@click.option("--limit", default=5, help="How many files to download and ingest.")
def cli(path, limit):
    # Command-line entry point: runs the async ingest() to completion.
    # Kept as a comment rather than a docstring because click would display
    # a docstring as the command's --help text.
    # NOTE(review): the help text speaks of files, but ingest() compares
    # `limit` against the number of buffered documents - confirm the intent.
    job = ingest(path, limit)
    asyncio.run(job)
42+
43+
44+
# Invoke the click command only when this file is executed as a script.
if __name__ == "__main__":
    cli()

pgml-apps/cc-warc/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
import os
2+
3+
# Connection string: taken from the PGML_DATABASE_URL environment variable,
# falling back to "postgres:///pgml" when the variable is unset.
DATABASE_URL = os.getenv("PGML_DATABASE_URL", "postgres:///pgml")

pgml-apps/cc-warc/paths.txt

Lines changed: 90000 additions & 0 deletions
Large diffs are not rendered by default.

pgml-apps/cc-warc/qa.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Exception: `id` must be a key in document

pgml-apps/cc-warc/requirements.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
blinker==1.8.2
2+
click==8.1.7
3+
Flask==3.0.3
4+
itsdangerous==2.2.0
5+
Jinja2==3.1.4
6+
MarkupSafe==2.1.5
7+
pgml==1.1.0
8+
Werkzeug==3.0.3

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy