Skip to content

Commit

Permalink
Merge pull request #1 from lfoppiano/feature/selective-pdf-rendering
Browse files Browse the repository at this point in the history
Render only a selection of pages
  • Loading branch information
lfoppiano committed Feb 13, 2024
2 parents d2998f6 + 61dd5da commit 9a65d5d
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 25 deletions.
15 changes: 13 additions & 2 deletions grobid/grobid_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class GrobidProcessor:
def __init__(self, grobid_client):
    """Keep a reference to the Grobid client that the processing methods call."""
    self.grobid_client = grobid_client

def process_structure(self, input_path):
def process_structure(self, input_path) -> (dict, int):
pdf_file, status, text = self.grobid_client.process_pdf("processFulltextDocument",
input_path,
consolidate_header=True,
Expand All @@ -41,8 +41,9 @@ def process_structure(self, input_path):
return

coordinates = self.get_coordinates(text)
pages = self.get_pages(text)

return coordinates
return coordinates, len(pages)

@staticmethod
def box_to_dict(box, color=None, type=None):
Expand Down Expand Up @@ -76,3 +77,13 @@ def get_coordinates(self, text):
)
count += 1
return coordinates

def get_pages(self, text):
    """Return the dimensions of every ``surface`` element found in ``text``.

    ``text`` is parsed as XML. For each ``surface`` element the width is
    computed as ``lrx - ulx`` and the height as ``lry - uly``, all read from
    the element's attributes and converted to ``float``.

    Returns a list with one ``{'width': ..., 'height': ...}`` dict per
    surface, in document order; the list is empty when no surface is found.
    """
    document = BeautifulSoup(text, 'xml')

    page_sizes = []
    for surface in document.find_all("surface"):
        # Lower-right minus upper-left corner gives the extent on each axis.
        width = float(surface['lrx']) - float(surface['ulx'])
        height = float(surface['lry']) - float(surface['uly'])
        page_sizes.append({'width': width, 'height': height})

    return page_sizes

89 changes: 66 additions & 23 deletions streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@
# Seed the session-state entries used further down in the script, leaving
# untouched any value that is already present in the session.
for _key, _initial in (('annotations', None), ('pages', None), ('page_selection', [])):
    if _key not in st.session_state:
        st.session_state[_key] = _initial


st.set_page_config(
page_title="Structure vision",
page_icon="",
Expand All @@ -58,10 +64,23 @@
highlight_figures = st.toggle('Figures and tables', value=True, disabled=not st.session_state['uploaded'])
highlight_callout = st.toggle('References citations in text', value=True, disabled=not st.session_state['uploaded'])
highlight_citations = st.toggle('Citations', value=True, disabled=not st.session_state['uploaded'])
st.divider()

st.header("Display options")
annotation_thickness = st.slider(label="Annotation boxes border thickness", min_value=1, max_value=6, value=1)
pages_vertical_spacing = st.slider(label="Pages vertical spacing", min_value=2, max_value=10, value=2)

st.header("Page Selection")
# Reserve a single slot in the layout. The page selector is written into this
# slot here while no page count is known, and the same `placeholder` is reused
# further down in the script once the page count is available.
placeholder = st.empty()

# No page count stored yet: render the selector with no options.
if not st.session_state['pages']:
    st.session_state['page_selection'] = placeholder.multiselect(
        "Select pages to display",
        options=[],
        default=[],
        help="The page number considered is the PDF number and not the document page number.",
        # Always True inside this branch, so the widget is shown disabled.
        disabled=not st.session_state['pages']
    )

st.header("Documentation")
st.markdown("https://github.com/lfoppiano/structure-vision")
st.markdown(
Expand Down Expand Up @@ -114,51 +133,75 @@ def get_file_hash(fname):
help="The full-text is extracted using Grobid. ")

if uploaded_file:
    # Grobid is called only while no document is stored in the session; the
    # results are kept in st.session_state and reused afterwards.
    # NOTE(review): once 'binary' is set, a different upload is presumably not
    # re-processed by this branch - confirm against the rest of the file.
    if not st.session_state['binary']:
        with st.spinner('Reading file, calling Grobid...'):
            binary = uploaded_file.getvalue()
            st.session_state['binary'] = binary

            # Flush before handing the path to Grobid so the whole PDF is on
            # disk, and let the context manager close the file afterwards.
            with NamedTemporaryFile() as tmp_file:
                tmp_file.write(bytearray(binary))
                tmp_file.flush()
                result = init_grobid().process_structure(tmp_file.name)

        # process_structure() has a bare `return` path, so the result may be
        # None instead of the (annotations, page count) pair.
        if result is None:
            annotations, pages = [], 0
            st.warning("Grobid did not return any structure for this document; "
                       "it is displayed without annotations.")
        else:
            annotations, pages = result

        # Keep values already present in the session.
        if not st.session_state['annotations']:
            st.session_state['annotations'] = annotations
        if not st.session_state['pages']:
            st.session_state['pages'] = pages

    if st.session_state['pages']:
        # 'pages' holds the page count and pages are numbered from 1, so the
        # upper bound must be count + 1 for the last page to be selectable.
        st.session_state['page_selection'] = placeholder.multiselect(
            "Select pages to display",
            options=list(range(1, st.session_state['pages'] + 1)),
            default=[],
            help="The page number considered is the PDF number and not the document page number.",
            disabled=not st.session_state['pages']
        )

    with st.spinner("Rendering PDF document"):
        # Annotation type -> state of the sidebar toggle that controls it.
        # NOTE(review): the affiliations toggle is matched against the empty
        # type string, exactly as in the original filters - confirm which type
        # value affiliation annotations actually carry.
        toggle_by_type = {
            's': highlight_sentences,
            'p': highlight_paragraphs,
            'title': highlight_title,
            'head': highlight_head,
            'biblStruct': highlight_citations,
            'note': highlight_notes,
            'ref': highlight_callout,
            'formula': highlight_formulas,
            'persName': highlight_person_names,
            'figure': highlight_figures,
            '': highlight_affiliations,
        }
        hidden_types = {kind for kind, enabled in toggle_by_type.items() if not enabled}

        # Filter once, starting from the stored annotations, so that every
        # disabled toggle is honoured together; the filtered list (not the
        # stored one) is what the viewer receives.
        annotations = [
            annotation
            for annotation in (st.session_state['annotations'] or [])
            if annotation['type'] not in hidden_types
        ]

        component = pdf_viewer(
            input=st.session_state['binary'],
            width=700,
            annotations=annotations,
            pages_vertical_spacing=pages_vertical_spacing,
            annotation_outline_size=annotation_thickness,
            pages_to_render=st.session_state['page_selection'],
        )

# if st.session_state['pages']:
# st.session_state['page_selection'] = placeholder.multiselect(
# "Select pages to display",
# options=list(range(1, st.session_state['pages'])),
# default=[],
# help="The page number considered is the PDF number and not the document page number.",
# disabled=not st.session_state['pages']
# )

0 comments on commit 9a65d5d

Please sign in to comment.