- main.py +51 -19
- requirements.txt +2 -1

main.py
CHANGED
@@ -9,7 +9,7 @@ import requests
 import schedule
 import srsly
 from bs4 import BeautifulSoup
-from datasets import Dataset, Image, load_dataset
+from datasets import Dataset, Image, concatenate_datasets, load_dataset
 from huggingface_hub import create_repo, login, whoami
 from PIL import Image as PILImage
 from retry import retry
@@ -19,7 +19,8 @@ dotenv.load_dotenv()
 login(token=os.environ.get("HF_TOKEN"))

 hf_user = whoami(os.environ.get("HF_TOKEN"))["name"]
-
+HF_REPO_ID_TXT = f"{hf_user}/zotero-answer-ai-article-texts"
+HF_REPO_ID_IMG = f"{hf_user}/zotero-answer-ai-article-images"


 ########################################################
@@ -66,7 +67,7 @@ def get_zotero_items(debug=False):
         print(f"# items fetched {len(items)}")

         if debug:
-            if len(items) >
+            if len(items) > 1500:
                 break

     return items
@@ -103,11 +104,18 @@ def get_arxiv_items(items):
         if arxiv_id in visited:
             continue

+        authors = []
+        for author in data.get("creators", []):
+            authors.append(f"{author.get('firstName', '')} {author.get('lastName', '')}")
+
         arxiv_items.append(
             {
                 "arxiv_id": arxiv_id,
                 "arxiv_url": arxiv_url,
+                "title": data.get("title", ""),
+                "authors": authors,
                 "pdf_url": pdf_url,
+                "date_published": data.get("date", ""),
                 "added_by": item["meta"]["createdByUser"]["username"],
                 "date_added": data.get("dateAdded", ""),
             }
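
For reference, a minimal sketch of what the new authors loop produces, assuming a typical Zotero API item payload; the sample creators below are invented for illustration:

# Hypothetical Zotero item "data" dict; the field names follow the Zotero API,
# but the values are made up for this example.
data = {
    "creators": [
        {"creatorType": "author", "firstName": "Ada", "lastName": "Lovelace"},
        {"creatorType": "author", "firstName": "Alan", "lastName": "Turing"},
    ]
}

authors = []
for author in data.get("creators", []):
    authors.append(f"{author.get('firstName', '')} {author.get('lastName', '')}")

print(authors)  # ['Ada Lovelace', 'Alan Turing']
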
@@ -129,10 +137,10 @@ def fetch_arxiv_htmls(arxiv_items):
     for item in tqdm(arxiv_items):
         html = fetch_arxiv_html(item["arxiv_id"])
         if html:
-            item["
+            item["raw_content"] = html
         else:
             print(f"failed to fetch html for {item['arxiv_id']}")
-            item["
+            item["raw_content"] = "Error"

     return arxiv_items

@@ -392,8 +400,6 @@ def create_hf_image_dataset(base_dir):
             "image": [d["image"] for d in data],
             "arxiv_id": [d["arxiv_id"] for d in data],
             "page_number": [d["page_number"] for d in data],
-            "width": [d["width"] for d in data],
-            "height": [d["height"] for d in data],
         }
     )

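
The width and height columns are dropped, leaving only the image and its identifiers. A self-contained sketch of how such a dataset can be assembled with the datasets Image feature; the cast_column step and the file name are assumptions for illustration, not necessarily what create_hf_image_dataset does:

from datasets import Dataset, Image
from PIL import Image as PILImage

# Write a tiny placeholder page image so the example runs on its own.
PILImage.new("RGB", (8, 8)).save("page_1.png")

data = [{"image": "page_1.png", "arxiv_id": "0000.00000", "page_number": 1}]
ds = Dataset.from_dict(
    {
        "image": [d["image"] for d in data],
        "arxiv_id": [d["arxiv_id"] for d in data],
        "page_number": [d["page_number"] for d in data],
    }
).cast_column("image", Image())  # decode file paths into PIL images on access

print(ds[0]["page_number"], ds[0]["image"].size)  # 1 (8, 8)
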
@@ -409,9 +415,17 @@ def create_hf_image_dataset(base_dir):


 def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
-    repo_id = HF_REPO_ID
+    # repo_id = HF_REPO_ID
+    create_repo(
+        repo_id=HF_REPO_ID_TXT,
+        token=os.environ.get("HF_TOKEN"),
+        private=True,
+        repo_type="dataset",
+        exist_ok=True,
+    )
+
     create_repo(
-        repo_id=
+        repo_id=HF_REPO_ID_IMG,
         token=os.environ.get("HF_TOKEN"),
         private=True,
         repo_type="dataset",
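
Passing exist_ok=True to create_repo makes the call idempotent, so the scheduled job can run repeatedly without failing once the dataset repo already exists. A minimal sketch of the same pattern, assuming you are already authenticated; the repo id is a placeholder, not the Space's real repo:

from huggingface_hub import create_repo

# Idempotent: returns the existing repo instead of raising if it is already there.
create_repo(
    repo_id="some-user/example-dataset",  # placeholder repo id
    repo_type="dataset",
    private=True,
    exist_ok=True,
)
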
@@ -421,20 +435,28 @@ def upload_to_hf(abstract_df, contents_df, processed_arxiv_ids):
     # upload image dataset
     try:
         img_ds = create_hf_image_dataset("data/arxiv_images")
-
+        try:
+            old_img_ds = load_dataset(HF_REPO_ID_IMG, "images")["train"]
+            img_ds = concatenate_datasets([old_img_ds, img_ds])
+        except Exception as e:
+            print(e)
+        img_ds.push_to_hub(HF_REPO_ID_IMG, "images", token=os.environ.get("HF_TOKEN"))
+    except Exception as e:
+        print(e)

+    try:
         # push id_to_abstract
         abstract_ds = Dataset.from_pandas(abstract_df)
-        abstract_ds.push_to_hub(
+        abstract_ds.push_to_hub(HF_REPO_ID_TXT, "abstracts", token=os.environ.get("HF_TOKEN"))

         # push arxiv_items
         arxiv_ds = Dataset.from_pandas(contents_df)
-        arxiv_ds.push_to_hub(
+        arxiv_ds.push_to_hub(HF_REPO_ID_TXT, "articles", token=os.environ.get("HF_TOKEN"))

         # push processed_arxiv_ids
         processed_arxiv_ids = [{"arxiv_id": arxiv_id} for arxiv_id in processed_arxiv_ids]
         processed_arxiv_ids_ds = Dataset.from_list(processed_arxiv_ids)
-        processed_arxiv_ids_ds.push_to_hub(
+        processed_arxiv_ids_ds.push_to_hub(HF_REPO_ID_TXT, "processed_arxiv_ids", token=os.environ.get("HF_TOKEN"))
     except Exception as e:
         print(e)

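
Each push_to_hub call above targets a named config inside the text repo ("abstracts", "articles", "processed_arxiv_ids"), while page images go to a separate repo and are merged with the previous upload via concatenate_datasets. A short sketch of reading those configs back, mirroring what main() does below; the repo id is a placeholder standing in for HF_REPO_ID_TXT:

from datasets import load_dataset

repo_id = "some-user/zotero-answer-ai-article-texts"  # placeholder for HF_REPO_ID_TXT

# Each config name loads as its own dataset from the same repository.
articles = load_dataset(repo_id, "articles")["train"]
abstracts = load_dataset(repo_id, "abstracts")["train"]
processed_ids = load_dataset(repo_id, "processed_arxiv_ids")["train"]["arxiv_id"]

print(len(articles), len(abstracts), len(processed_ids))
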
@@ -454,7 +476,7 @@ def main():

     # get already processed arxiv ids from HF
     try:
-        existing_arxiv_ids = load_dataset(
+        existing_arxiv_ids = load_dataset(HF_REPO_ID_TXT, "processed_arxiv_ids")["train"]["arxiv_id"]
     except Exception as e:
         print(e)
         try:
@@ -471,14 +493,15 @@ def main():
     print(f"# of new arxiv items: {len(arxiv_items)}")

     processed_arxiv_ids = set()
+    pbar = tqdm(range(len(arxiv_items)))
+
     for item in arxiv_items:
         # download images --
         save_arxiv_article_images(item["arxiv_id"])

         # parse html
         try:
-            item["contents"] = parse_html_content(item["
-            processed_arxiv_ids.add(item["arxiv_id"])
+            item["contents"] = parse_html_content(item["raw_content"])
         except Exception as e:
             print(f"Failed to parse html for {item['arxiv_id']}: {e}")
             item["contents"] = []
@@ -486,12 +509,21 @@ def main():
         if len(item["contents"]) == 0:
             print("Extracting from pdf...")
             md_content = get_pdf_text(item["arxiv_id"])  # fix this
+            item["raw_content"] = md_content
+
             if md_content:
                 item["contents"] = parse_markdown_content(md_content, item["arxiv_id"])
-                processed_arxiv_ids.add(item["arxiv_id"])
             else:
                 item["contents"] = []

+        if len(item["contents"]) > 0:
+            processed_arxiv_ids.add(item["arxiv_id"])
+            if len(item["authors"]) == 0:
+                item["authors"] = []  # ["unknown"]
+                item["title"] = item["contents"][0]["paper_title"]
+        pbar.update(1)
+    pbar.close()
+
     # save contents ---
     processed_arxiv_ids = list(processed_arxiv_ids)
     print(f"# of processed arxiv ids: {len(processed_arxiv_ids)}")
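
The progress bar here is advanced by hand (pbar.update(1) per item, pbar.close() at the end). An equivalent sketch using tqdm as a context manager with an explicit total, which also closes the bar if an exception escapes the loop; the stand-in data below is illustrative only:

from tqdm import tqdm

arxiv_items = [{"arxiv_id": "0000.00000"}]  # stand-in data for illustration

with tqdm(total=len(arxiv_items)) as pbar:
    for item in arxiv_items:
        # ... process one item ...
        pbar.update(1)
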
@@ -509,7 +541,7 @@ def main():

     # add to existing dataset
     try:
-        old_abstract_df = load_dataset(
+        old_abstract_df = load_dataset(HF_REPO_ID_TXT, "abstracts")["train"].to_pandas()
     except Exception as e:
         print(e)
         old_abstract_df = pd.DataFrame(columns=abstract_df.columns)
@@ -522,7 +554,7 @@ def main():
     contents_df = pd.DataFrame(arxiv_items)
     print(contents_df.head())
     try:
-        old_contents_df = load_dataset(
+        old_contents_df = load_dataset(HF_REPO_ID_TXT, "articles")["train"].to_pandas()
     except Exception as e:
         print(e)
         old_contents_df = pd.DataFrame(columns=contents_df.columns)

requirements.txt
CHANGED
@@ -12,4 +12,5 @@ retry
 pandas
 datasets
 PyMuPDF
-pillow
+pillow
+tqdm