-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_api.py
83 lines (73 loc) · 2.82 KB
/
scrape_api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import json
import re
from datetime import datetime
from typing import List, Optional

import requests
import uvicorn
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from fastapi.concurrency import run_in_threadpool
# Root of the Hellenic Train website; scraped page paths are appended to it.
BASE_URL = "https://www.hellenictrain.gr"
# FastAPI application instance; the route handlers in this file register on it.
app = FastAPI()
class ArticlePreview:
    """Teaser data for a single announcement.

    The constructor stores its four arguments verbatim as the attributes
    ``title``, ``url``, ``preview`` and ``id``; nothing is stripped or
    validated here.
    """

    def __init__(self, title: str, url: str, preview: str, id: int):
        # Assignment order is kept (title, url, preview, id) so that
        # ``vars(instance)`` lists the attributes in the same order.
        self.title, self.url = title, url
        self.preview, self.id = preview, id
class Article:
    """A full announcement: preview metadata plus the article body text.

    Attributes set by the constructor, in this order:
        title:   the preview title with surrounding whitespace removed.
        url:     the preview URL, unchanged.
        content: the body text with surrounding whitespace removed.
        id:      the preview id, unchanged.
        date:    whatever ``extractTimestamp`` returns for the raw title
                 (a Unix timestamp, or ``None`` when no date is found).
    """

    def __init__(self, baseArticle: ArticlePreview, content: str):
        # Characters trimmed from both ends of the title and the body.
        trim_chars = " \n\t\r"
        self.title = baseArticle.title.strip(trim_chars)
        self.url = baseArticle.url
        self.content = content.strip(trim_chars)
        self.id = baseArticle.id
        # The date is parsed from the raw, untrimmed title.
        self.date = extractTimestamp(baseArticle.title)

    def to_json(self):
        """Return the instance attributes serialized as a JSON object string."""
        return json.dumps(vars(self))
def getUrls(page: int = 0, english: bool = False, timeout: float = 10.0) -> List[ArticlePreview]:
    """Scrape one page of the announcements index and return its teasers.

    Args:
        page: Index page to fetch. Values of 0 or below request the index
            without a ``?page=`` query string.
        english: When true, scrape ``/en/announcements`` instead of the
            Greek ``/anakoinoseis-ht`` listing.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the caller indefinitely.

    Returns:
        One ``ArticlePreview`` per ``<article>`` element whose first
        descendant tag has exactly the class ``node--news--teaser``.
        Teasers lacking an ``about`` attribute or an integer
        ``data-history-node-id`` are skipped.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            when the timeout elapses.
    """
    news_subpath = "/en/announcements" if english else "/anakoinoseis-ht"
    query = f"?page={page}" if page > 0 else ""
    response = requests.get(BASE_URL + news_subpath + query, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    previews = []
    for article in soup.find_all('article'):
        first_child = article.find()
        if first_child is None or first_child.get('class') != ['node--news--teaser']:
            continue

        # Both attributes are required to build a usable preview; a teaser
        # missing either one is skipped instead of aborting the whole page.
        about = article.get('about')
        if about is None:
            continue
        try:
            article_id = int(article.get('data-history-node-id'))
        except (TypeError, ValueError):
            continue

        # Look each element up once and fall back to a placeholder text.
        title_tag = article.find(class_='news-title')
        body_tag = article.find(class_='news-body')
        title = title_tag.text if title_tag is not None else "No title"
        preview_text = body_tag.text if body_tag is not None else "No preview"

        previews.append(ArticlePreview(title, BASE_URL + about, preview_text, article_id))
    return previews
def extractTimestamp(title: str) -> Optional[int]:
    """Find the first ``D/M/YYYY``-shaped date in *title* and convert it.

    Day and month may have one or two digits and may be separated by
    ``/`` or ``-``.

    Args:
        title: Text to search, typically an announcement title.

    Returns:
        The value returned by ``toTimestamp`` for the matched date, or
        ``None`` when the title contains no date-shaped text or the
        matched text is not a convertible date.
    """
    match = re.search(r'\d{1,2}[-/]\d{1,2}[-/]\d{4}', title)
    if match is None:
        print("No date found")
        return None
    try:
        return toTimestamp(match.group())
    except (ValueError, OverflowError, OSError):
        # The pattern only checks the shape, so text such as "45/13/2024"
        # matches but cannot be parsed; out-of-range years can also fail in
        # the platform time conversion. Treat both as "no usable date".
        print("Invalid date found")
        return None
def toTimestamp(date: str) -> int:
    """Convert a ``DD/MM/YYYY`` or ``DD-MM-YYYY`` string to whole seconds.

    Dashes are normalized to slashes before parsing. The parsed datetime
    is naive, so ``timestamp()`` interprets it in the local time zone of
    the running process.

    Raises:
        ValueError: if the text does not parse with ``%d/%m/%Y``.
    """
    parsed = datetime.strptime(date.replace('-', '/'), '%d/%m/%Y')
    return int(parsed.timestamp())
def extractArticle(preview: ArticlePreview, timeout: float = 10.0) -> Article:
    """Download the page behind *preview* and build the full ``Article``.

    Args:
        preview: Teaser whose ``url`` is fetched.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the caller indefinitely.

    Returns:
        An ``Article`` whose content is the text of the first element with
        class ``news-body``, or an empty string when the page has none.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            when the timeout elapses.
    """
    response = requests.get(preview.url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Single lookup; the element is absent on pages without a body.
    body_tag = soup.find(class_='news-body')
    content = body_tag.text if body_tag is not None else ""
    return Article(preview, content)
# Return the most recent announcement.
@app.get("/latest")
async def get_articles(en: bool = False):
    """Return the first article of the first announcements page.

    Args:
        en: When true, scrape the English listing instead of the Greek one.

    Raises:
        HTTPException: 404 when the listing yields no articles.
    """
    def _scrape_latest():
        previews = getUrls(english=en)
        return extractArticle(previews[0]) if previews else None

    # The scraping helpers use blocking HTTP calls, so run them in the
    # thread pool rather than on the event loop.
    article = await run_in_threadpool(_scrape_latest)
    if article is None:
        # An empty listing previously surfaced as an IndexError (HTTP 500).
        raise HTTPException(status_code=404, detail="No articles found")
    return article
# Return every article listed on one announcements page.
@app.get("/")
async def get_articles(page: int = 0, en: bool = False):
    """Return the full articles for one page of the announcements index.

    Args:
        page: Index page to scrape; passed through to ``getUrls``.
        en: When true, scrape the English listing instead of the Greek one.

    Returns:
        A list with one ``Article`` per teaser on the page, in page order.
    """
    def _scrape_page():
        return [extractArticle(preview) for preview in getUrls(page, en)]

    # The scraping helpers use blocking HTTP calls, so run them in the
    # thread pool rather than on the event loop.
    return await run_in_threadpool(_scrape_page)
# Script entry point: when the file is executed directly, serve the app
# with uvicorn on port 8000, bound to all network interfaces.
if __name__ == "__main__":
    uvicorn.run(
        app,
        port=8000,
        host="0.0.0.0",
    )