hackernewspaper/generator.py at main · bitfox-git/hackernewspaper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import os
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
from fake_useragent import UserAgent
from config import asset_dir,ISSUE

from url_handlers import (
    DefaultHandler,
    GithubHandler,
    PDFHandler,
    YoutubeHandler,
    download_html,
)

# Create the asset directory up front; an already existing directory is fine.
os.makedirs(asset_dir, exist_ok=True)

# Module-level fake_useragent instance (not referenced again in this file's
# visible code).
ua = UserAgent()

# Values exposed to the LaTeX template; the quote fields are added later.
paperdata = {"issue": ISSUE}

def parse_html(html):
    """Parse the HTML version of the newsletter with BeautifulSoup.

    Args:
        html: The HTML markup to parse.

    Returns:
        The ``BeautifulSoup`` tree built with the ``html.parser`` backend.
    """
    return BeautifulSoup(html, 'html.parser')


def get_header(soup):
    """Return the text of the first <p> inside the element with id="header".

    Args:
        soup: Parsed document; it must support ``find(id=...)``.

    Returns:
        The paragraph's text, or "" when the header element or its <p>
        is missing.
    """
    header = soup.find(id="header")
    if header is None:
        return ""
    paragraph = header.p
    # A header without any <p> yields None here; treat it like a missing
    # header instead of failing on ``None.get_text()``.
    if paragraph is None:
        return ""
    return paragraph.get_text()

def parse_header(header):
    """Split a header line into its quote and its author.

    The header could be something like
    "The best way to predict the future is to invent it. // Alan Kay",
    where the author is separated from the quote by "//".

    The split happens at the LAST "//" so that a quote which itself
    contains "//" (for example a URL) still yields its author.
    Surrounding whitespace is returned untouched.

    Args:
        header: The raw header text.

    Returns:
        A ``(quote, author)`` tuple; ``(header, "")`` when there is no "//".
    """
    quote, separator, author = header.rpartition("//")
    if not separator:
        # No "//" at all: the whole header is the quote.
        return header, ""
    return quote, author

class article:
    """One linked entry of the newsletter.

    Attributes are stored exactly as passed in:
        mainurl: href of the entry's main link.
        title: title attribute of the main link.
        text: visible text of the main link.
        subtext: text of the entry's <span>, or "".
        suburl: href of the link inside that <span>, or "".
        category: text of the heading the entry was listed under.
    """

    def __init__(self, mainurl, title, text, subtext, suburl, category):
        self.mainurl = mainurl
        self.title = title
        self.text = text
        self.subtext = subtext
        self.suburl = suburl
        self.category = category

    def __str__(self):
        """Return the fields (except category) joined by "-".

        A field that is None (e.g. a link without a title attribute) is
        rendered as empty text instead of raising TypeError.
        """
        parts = (self.mainurl, self.title, self.text, self.subtext, self.suburl)
        return "-".join("" if part is None else str(part) for part in parts)

def get_articles(soup):
    """Collect the articles and category names from the newsletter body.

    The element with id="content" is scanned for <h2> headings. Each
    heading's text is a category; every <p> sibling that follows it, up to
    the next <h2>, contributes one article if it contains an <a>.

    Args:
        soup: Parsed document; it must support ``find(id=...)``.

    Returns:
        A ``(articles, categories)`` tuple: a list of ``article`` objects
        and a list of heading texts, both in document order. Both lists
        are empty when the document has no id="content" element.
    """
    categories = []
    articles = []

    content = soup.find(id="content")
    if content is None:
        # Nothing to scan; previously this failed on ``None.find_all``.
        return articles, categories

    # All content starts with an h2.
    for heading in content.find_all("h2"):
        category = heading.get_text()
        categories.append(category)
        for sibling in heading.next_siblings:
            if sibling.name == "h2":
                # The next heading starts a new category.
                break
            if sibling.name != "p":
                continue

            link = sibling.find("a")
            if link is None:
                continue

            # The optional <span> carries secondary text and its own link.
            subtext = ""
            suburl = ""
            span = sibling.find("span")
            if span is not None:
                subtext = span.text
                sublink = span.find("a")
                if sublink is not None:
                    suburl = sublink.get("href")

            articles.append(
                article(
                    link.get("href"),
                    link.get("title"),
                    link.get_text(),
                    subtext,
                    suburl,
                    category,
                )
            )
    return articles, categories


# Download the requested issue from the newsletter archive and parse it.
html = download_html("https://buttondown.com/hacker-newsletter/archive/hacker-newsletter-"+ISSUE)
soup = parse_html(html)
header = get_header(soup)

articles, categories = get_articles(soup)

# parse the content of each link
newsitems = []


# Handlers are tried in this order; the first one whose test() accepts an
# article processes it (presumably DefaultHandler is the catch-all -- confirm
# in url_handlers).
handlers = [YoutubeHandler(), PDFHandler(), GithubHandler(), DefaultHandler()]

with sync_playwright() as p:
    browser = p.chromium.launch()
    try:
        for index, art in enumerate(articles):
            for handler in handlers:
                if handler.test(art):
                    # TODO maybe make it so if it throws an exception fallback to another handler
                    newsitems.append(handler.work(index, art, browser))
                    print("New article indexed: ", newsitems[-1]["title"])
                    break
    finally:
        # Close the browser even when a handler raises, so a failing article
        # does not skip the explicit shutdown.
        browser.close()

quoteLine, quoteAuthor = parse_header(header)

# remove any line breaks from quoteLine
quoteLine = quoteLine.replace("\n", "")

paperdata["quoteLine"] = quoteLine
paperdata["quoteAuthor"] = quoteAuthor


# Everything the template receives.
DICT_VALS = {
    'data' : paperdata,
    'categories': categories,
    'newsitems': newsitems
    }

# Do the latex stuff
from latexbuild import render_latex_template

PATH_JINJA2 = "."
PATH_TEMPLATE_RELATIVE_TO_PATH_JINJA2 = "template.tex"
PATH_OUTPUT_PDF = "MYOUTPUTFILE.pdf"

# Only the Jinja2 template is rendered here; the rendered LaTeX source is
# written to output.tex below. The one-step PDF build is kept for reference:
#build_pdf(PATH_JINJA2, PATH_TEMPLATE_RELATIVE_TO_PATH_JINJA2, PATH_OUTPUT_PDF, DICT_VALS)
print("PDF template creation started")
latexresult = render_latex_template(
    PATH_JINJA2,
    PATH_TEMPLATE_RELATIVE_TO_PATH_JINJA2,
    DICT_VALS
    )

# store latexresult in a file using utf8 encoding
with open("output.tex", "w", encoding="utf-8") as f:
    print("PDF template creation finished, writing to file")
    f.write(latexresult)
    print("PDF template written to file")