-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathgenerator.py
More file actions
156 lines (121 loc) · 4.44 KB
/
Copy pathgenerator.py
File metadata and controls
156 lines (121 loc) · 4.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import os
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
from fake_useragent import UserAgent
from config import asset_dir,ISSUE
from url_handlers import (
DefaultHandler,
GithubHandler,
PDFHandler,
YoutubeHandler,
download_html,
)
# Create the asset directory up front (no error if it already exists).
os.makedirs(asset_dir, exist_ok=True)

# Module-level fake_useragent instance.
ua = UserAgent()

# Template data for the issue being rendered; the quote fields are added
# further down once the header has been parsed.
paperdata = dict(issue=ISSUE)
def parse_html(html):
    """Parse the HTML version of the newsletter into a BeautifulSoup tree.

    The document is parsed with the ``html.parser`` backend.
    """
    return BeautifulSoup(html, "html.parser")
def get_header(soup):
    """Return the text of the first ``<p>`` inside the element with id="header".

    Args:
        soup: Parsed newsletter document (anything exposing ``find(id=...)``).

    Returns:
        The paragraph's text, or an empty string when either the header
        element or its ``<p>`` is missing.
    """
    header = soup.find(id="header")
    if header is None:
        return ""

    paragraph = header.p
    # A header without a <p> would otherwise fail with AttributeError on
    # None.get_text(); treat it like a missing header.
    if paragraph is None:
        return ""
    return paragraph.get_text()
def parse_header(header):
    """Split a header line into its quote and author parts.

    The header could be something like
    "The best way to predict the future is to invent it. // Alan Kay",
    i.e. the author is separated from the quote by "//".

    Args:
        header: Raw header text.

    Returns:
        A ``(quote, author)`` tuple with surrounding whitespace stripped
        from both parts. The author is "" when the header contains no "//".
    """
    # Split on the last "//" so a quote that itself contains "//" (for
    # example a URL) still yields its author instead of losing it.
    quote, separator, author = header.rpartition("//")
    if not separator:
        return header.strip(), ""
    return quote.strip(), author.strip()
class article:
    """One linked entry of the newsletter.

    The constructor stores its arguments unchanged; any of them may be
    ``None`` (for example ``title`` when the link has no title attribute).
    """

    def __init__(self, mainurl, title, text, subtext, suburl, category):
        self.mainurl = mainurl    # href of the entry's main link
        self.title = title        # title attribute of the main link
        self.text = text          # visible text of the main link
        self.subtext = subtext    # text of the entry's <span>, "" if none
        self.suburl = suburl      # href of the link inside the <span>, "" if none
        self.category = category  # heading the entry was listed under

    def __str__(self):
        """Return the fields (without the category) joined by "-".

        ``None`` fields are rendered as empty strings so that an entry
        without a title no longer raises TypeError.
        """
        parts = (self.mainurl, self.title, self.text, self.subtext, self.suburl)
        return "-".join("" if part is None else str(part) for part in parts)
def get_articles(soup):
    """Collect the linked articles of the newsletter, grouped by category.

    Every ``<h2>`` inside the element with id="content" starts a category.
    The ``<p>`` siblings that follow it, up to the next ``<h2>``, are that
    category's entries; a ``<p>`` without an ``<a>`` is skipped.

    Args:
        soup: Parsed newsletter document.

    Returns:
        ``(articles, categories)``: the ``article`` objects and the category
        headings, both in document order. Both lists are empty when the
        document has no element with id="content".
    """
    articles = []
    categories = []

    content = soup.find(id="content")
    if content is None:
        # Same policy as get_header(): a missing section yields an empty
        # result instead of an AttributeError on None.find_all().
        return articles, categories

    # all content starts with a h2
    for heading in content.find_all("h2"):
        category = heading.get_text()
        categories.append(category)

        for sibling in heading.next_siblings:
            if sibling.name == "h2":
                break  # the next category starts here
            if sibling.name != "p":
                continue

            link = sibling.find("a")
            if link is None:
                continue

            # Optional secondary text/link lives in the paragraph's <span>.
            subtext = ""
            suburl = ""
            span = sibling.find("span")
            if span is not None:
                subtext = span.text
                sublink = span.find("a")
                if sublink is not None:
                    suburl = sublink.get("href")

            articles.append(
                article(
                    link.get("href"),
                    link.get("title"),
                    link.get_text(),
                    subtext,
                    suburl,
                    category,
                )
            )

    return articles, categories
# Download the requested issue and extract its header quote and articles.
html = download_html("https://buttondown.com/hacker-newsletter/archive/hacker-newsletter-"+ISSUE)
soup = parse_html(html)
header = get_header(soup)
articles, categories = get_articles(soup)

# parse the content of each link
newsitems = []
# Handlers are tried in this order; the first one whose test() accepts the
# article processes it and the remaining handlers are skipped.
handlers = [YoutubeHandler(), PDFHandler(), GithubHandler(), DefaultHandler()]
with sync_playwright() as p:
    browser = p.chromium.launch()
    try:
        for index, art in enumerate(articles):
            for handler in handlers:
                if handler.test(art):
                    # TODO maybe make it so if it throws a exception fallback to another handler
                    newsitems.append(handler.work(index, art, browser))
                    print("New article indexed: ", newsitems[-1]["title"])
                    break
    finally:
        # Close the browser even when a handler raises.
        browser.close()

quoteLine, quoteAuthor = parse_header(header)
# remove any line breaks from quoteLine
quoteLine = quoteLine.replace("\n", "")
paperdata["quoteLine"] = quoteLine
paperdata["quoteAuthor"] = quoteAuthor

# Values handed to the LaTeX template.
DICT_VALS = {
    'data' : paperdata,
    'categories': categories,
    'newsitems': newsitems
}

# Do the latex stuff
from latexbuild import render_latex_template

PATH_JINJA2 = "."
PATH_TEMPLATE_RELATIVE_TO_PATH_JINJA2 = "template.tex"
PATH_OUTPUT_PDF = "MYOUTPUTFILE.pdf"

# Only the Jinja2 template is rendered here; the one-step build that would
# also compile the LaTeX and move the PDF to PATH_OUTPUT_PDF is disabled:
#build_pdf(PATH_JINJA2, PATH_TEMPLATE_RELATIVE_TO_PATH_JINJA2, PATH_OUTPUT_PDF, DICT_VALS)
print("PDF template creation started")
latexresult = render_latex_template(
    PATH_JINJA2,
    PATH_TEMPLATE_RELATIVE_TO_PATH_JINJA2,
    DICT_VALS
)

# store latexresult in a file using utf8 encoding
with open("output.tex", "w", encoding="utf-8") as f:
    print("PDF template creation finished, writing to file")
    f.write(latexresult)
    print("PDF template written to file")