Upload 5 files
- .streamlit/config.toml +3 -0
- Demo.py +175 -0
- Dockerfile +70 -0
- pages/Workflow & Model Overview.py +364 -0
- requirements.txt +6 -0
.streamlit/config.toml
ADDED
@@ -0,0 +1,3 @@
[theme]
base="light"
primaryColor="#29B4E8"
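# Theme notes: "base" selects Streamlit's light preset; "primaryColor" sets the
# accent color used for interactive widgets.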
Demo.py
ADDED
@@ -0,0 +1,175 @@
import streamlit as st
import sparknlp
import pandas as pd
import json

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline

# Page configuration
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)

# CSS for styling
st.markdown("""
<style>
.main-title {
    font-size: 36px;
    color: #4A90E2;
    font-weight: bold;
    text-align: center;
}
.section {
    background-color: #f9f9f9;
    padding: 10px;
    border-radius: 10px;
    margin-top: 10px;
}
.section p, .section ul {
    color: #666666;
}
</style>
""", unsafe_allow_html=True)

@st.cache_resource
def init_spark():
    # Start (or reuse) a Spark session with Spark NLP loaded
    return sparknlp.start()

@st.cache_resource
def create_pipeline(model):
    # Assemble the table JSON and the questions into document annotations
    document_assembler = MultiDocumentAssembler() \
        .setInputCols("table_json", "questions") \
        .setOutputCols("document_table", "document_questions")

    # Split the questions document into individual sentences
    sentence_detector = SentenceDetector() \
        .setInputCols(["document_questions"]) \
        .setOutputCol("questions")

    # Parse the JSON document into a table annotation
    table_assembler = TableAssembler() \
        .setInputCols(["document_table"]) \
        .setOutputCol("table")

    # WTQ variant: strongest on aggregation-style questions
    tapas_wtq = TapasForQuestionAnswering \
        .pretrained("table_qa_tapas_base_finetuned_wtq", "en") \
        .setInputCols(["questions", "table"]) \
        .setOutputCol("answers_wtq")

    # SQA variant: strongest on direct, sequential questions
    tapas_sqa = TapasForQuestionAnswering \
        .pretrained("table_qa_tapas_base_finetuned_sqa", "en") \
        .setInputCols(["questions", "table"]) \
        .setOutputCol("answers_sqa")

    # Both variants run in one pipeline; the sidebar choice decides which output is shown
    pipeline = Pipeline(stages=[document_assembler, sentence_detector, table_assembler, tapas_wtq, tapas_sqa])
    return pipeline

def fit_data(pipeline, json_data, question):
    spark_df = spark.createDataFrame([[json_data, question]]).toDF("table_json", "questions")
    model = pipeline.fit(spark_df)
    result = model.transform(spark_df)
    return result.select("answers_wtq.result", "answers_sqa.result").collect()

# Sidebar content
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    ["table_qa_tapas_base_finetuned_wtq", "table_qa_tapas_base_finetuned_sqa"],
    help="For more info about the models visit: https://sparknlp.org/models"
)

# Set up the page layout
title = 'TAPAS for Table-Based Question Answering with Spark NLP'
sub_title = (
    'TAPAS (Table Parsing Supervised via Pre-trained Language Models) is a model that extends '
    'the BERT architecture to handle tabular data. Unlike traditional models that require flattening '
    'tables into text, TAPAS can directly interpret tables, making it a powerful tool for answering '
    'questions that involve tabular data.'
)

st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
st.markdown(f'<div class="section"><p>{sub_title}</p></div>', unsafe_allow_html=True)

# Reference notebook link in sidebar
link = """
<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/15.1_Table_Question_Answering.ipynb">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Define the JSON data for the table
json_data = '''
{
  "header": ["name", "net_worth", "age", "nationality", "company", "industry"],
  "rows": [
    ["Elon Musk", "$200,000,000,000", "52", "American", "Tesla, SpaceX", "Automotive, Aerospace"],
    ["Jeff Bezos", "$150,000,000,000", "60", "American", "Amazon", "E-commerce"],
    ["Bernard Arnault", "$210,000,000,000", "74", "French", "LVMH", "Luxury Goods"],
    ["Bill Gates", "$120,000,000,000", "68", "American", "Microsoft", "Technology"],
    ["Warren Buffett", "$110,000,000,000", "93", "American", "Berkshire Hathaway", "Conglomerate"],
    ["Larry Page", "$100,000,000,000", "51", "American", "Google", "Technology"],
    ["Mark Zuckerberg", "$85,000,000,000", "40", "American", "Meta", "Social Media"],
    ["Mukesh Ambani", "$80,000,000,000", "67", "Indian", "Reliance Industries", "Conglomerate"],
    ["Alice Walton", "$65,000,000,000", "74", "American", "Walmart", "Retail"],
    ["Francoise Bettencourt Meyers", "$70,000,000,000", "70", "French", "L'Oreal", "Cosmetics"],
    ["Amancio Ortega", "$75,000,000,000", "88", "Spanish", "Inditex (Zara)", "Retail"],
    ["Carlos Slim", "$55,000,000,000", "84", "Mexican", "America Movil", "Telecom"]
  ]
}
'''

# Define queries for selection
queries = [
    "Who has a higher net worth, Bernard Arnault or Jeff Bezos?",
    "List the top three individuals by net worth.",
    "Who is the richest person in the technology industry?",
    "Which company in the e-commerce industry has the highest net worth?",
    "Who is the oldest billionaire on the list?",
    "Which individual under the age of 60 has the highest net worth?",
    "Who is the wealthiest American, and which company do they own?",
    "Find all French billionaires and list their companies.",
    "How many women are on the list, and what are their total net worths?",
    "Who is the wealthiest non-American on the list?",
    "Find the person who is the youngest and has a net worth over $100 billion.",
    "Who owns companies in more than one industry, and what are those industries?",
    "What is the total net worth of all individuals over 70?",
    "How many billionaires are in the conglomerate industry?"
]

# Load the JSON data into a DataFrame and display it
table_data = json.loads(json_data)
df_table = pd.DataFrame(table_data["rows"], columns=table_data["header"])
df_table.index += 1

st.write("")
st.write("Context DataFrame (Click To Edit)")
edited_df = st.data_editor(df_table)

# Convert the edited DataFrame back to JSON format
table_json_data = {
    "header": edited_df.columns.tolist(),
    "rows": edited_df.values.tolist()
}
table_json_str = json.dumps(table_json_data)

# User input for questions
selected_text = st.selectbox("Question Query", queries)
custom_input = st.text_input("Try it with your own Question!")
text_to_analyze = custom_input if custom_input else selected_text

# Initialize Spark and create the pipeline
spark = init_spark()
pipeline = create_pipeline(model)

# Run the pipeline with the selected query and the edited table data
output = fit_data(pipeline, table_json_str, text_to_analyze)

# Display the answer from the model selected in the sidebar
st.markdown("---")
st.subheader("Processed output:")
answers_wtq, answers_sqa = output[0][0], output[0][1]
answer = answers_wtq if model == "table_qa_tapas_base_finetuned_wtq" else answers_sqa
st.write("**Answer:**", ', '.join(answer))
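
# Performance note (a sketch of a possible optimization, not used above):
# transform() on a one-row DataFrame launches a full Spark job per request.
# Spark NLP's LightPipeline can serve single inputs with lower latency, roughly:
#
#   from sparknlp.base import LightPipeline
#   light = LightPipeline(fitted_model)  # fitted_model = pipeline.fit(spark_df)
#   annotations = light.fullAnnotate(table_json_str, text_to_analyze)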
Dockerfile
ADDED
@@ -0,0 +1,70 @@
# Download base image ubuntu 18.04
FROM ubuntu:18.04

# Set environment variables
ENV NB_USER jovyan
ENV NB_UID 1000
ENV HOME /home/${NB_USER}

# Install required packages
RUN apt-get update && apt-get install -y \
    tar \
    wget \
    bash \
    rsync \
    gcc \
    libfreetype6-dev \
    libhdf5-serial-dev \
    libpng-dev \
    libzmq3-dev \
    python3 \
    python3-dev \
    python3-pip \
    unzip \
    pkg-config \
    software-properties-common \
    graphviz \
    openjdk-8-jdk \
    ant \
    ca-certificates-java \
    && apt-get clean \
    && update-ca-certificates -f

# Install Python 3.8 and pip
RUN add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y python3.8 python3-pip \
    && apt-get clean

# Set up JAVA_HOME
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
RUN mkdir -p ${HOME} \
    && echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/" >> ${HOME}/.bashrc \
    && chown -R ${NB_UID}:${NB_UID} ${HOME}

# Create a new user named "jovyan" with user ID 1000
RUN useradd -m -u ${NB_UID} ${NB_USER}

# Switch to the "jovyan" user
USER ${NB_USER}

# Set home and path variables for the user
ENV HOME=/home/${NB_USER} \
    PATH=/home/${NB_USER}/.local/bin:$PATH

# Set the working directory to the user's home directory
WORKDIR ${HOME}

# Upgrade pip and install Python dependencies
RUN python3.8 -m pip install --upgrade pip
COPY requirements.txt /tmp/requirements.txt
RUN python3.8 -m pip install -r /tmp/requirements.txt

# Copy the application code into the container at /home/jovyan
COPY --chown=${NB_USER}:${NB_USER} . ${HOME}

# Expose port for Streamlit
EXPOSE 7860

# Define the entry point for the container
ENTRYPOINT ["streamlit", "run", "Demo.py", "--server.port=7860", "--server.address=0.0.0.0"]
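
# Example usage (standard Docker CLI; the image tag "tapas-demo" is arbitrary):
#   docker build -t tapas-demo .
#   docker run -p 7860:7860 tapas-demo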
pages/Workflow & Model Overview.py
ADDED
@@ -0,0 +1,364 @@
import streamlit as st
import pandas as pd

# Custom CSS for better styling
st.markdown("""
<style>
.main-title {
    font-size: 36px;
    color: #4A90E2;
    font-weight: bold;
    text-align: center;
}
.sub-title {
    font-size: 24px;
    color: #4A90E2;
    margin-top: 20px;
}
.section {
    background-color: #f9f9f9;
    padding: 15px;
    border-radius: 10px;
    margin-top: 20px;
}
.section p, .section ul {
    color: #666666;
}
.link {
    color: #4A90E2;
    text-decoration: none;
}
h2 {
    color: #4A90E2;
    font-size: 28px;
    font-weight: bold;
    margin-top: 30px;
}
h3 {
    color: #4A90E2;
    font-size: 22px;
    font-weight: bold;
    margin-top: 20px;
}
h4 {
    color: #4A90E2;
    font-size: 18px;
    font-weight: bold;
    margin-top: 15px;
}
</style>
""", unsafe_allow_html=True)

# Main Title
st.markdown('<div class="main-title">Question Answering Over Tables with TAPAS and Spark NLP</div>', unsafe_allow_html=True)

# Overview Section
st.markdown("""
<div class="section">
    <p>As data becomes increasingly complex, extracting meaningful insights from tabular data is more important than ever. TAPAS, a transformer-based model developed by Google, is designed specifically to handle question answering over tables. By combining TAPAS with Spark NLP, we can leverage the power of distributed computing to process large datasets efficiently.</p>
    <p>This guide will walk you through the process of setting up TAPAS in Spark NLP, implementing two specific models (<code>table_qa_tapas_base_finetuned_wtq</code> and <code>table_qa_tapas_base_finetuned_sqa</code>), and understanding their best use cases.</p>
</div>
""", unsafe_allow_html=True)

# Introduction to TAPAS and Spark NLP
st.markdown('<div class="sub-title">Introduction to TAPAS and Spark NLP</div>', unsafe_allow_html=True)

# What is TAPAS?
st.markdown("""
<div class="section">
    <h3>What is TAPAS?</h3>
    <p>TAPAS (Table Parsing Supervised via Pre-trained Language Models) is a model that extends the BERT architecture to handle tabular data. Unlike traditional models that require flattening tables into text, TAPAS can directly interpret tables, making it a powerful tool for answering questions that involve tabular data.</p>
</div>
""", unsafe_allow_html=True)

# Why Use TAPAS with Spark NLP?
st.markdown("""
<div class="section">
    <h3>Why Use TAPAS with Spark NLP?</h3>
    <p>Spark NLP, developed by John Snow Labs, is an open-source library that provides state-of-the-art natural language processing capabilities within a distributed computing framework. Integrating TAPAS with Spark NLP allows you to scale your question-answering tasks across large datasets, making it ideal for big data environments.</p>
</div>
""", unsafe_allow_html=True)
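
# A minimal session-setup sketch (illustrative addition, not part of the original walkthrough)
st.markdown("""
<div class="section">
    <p>Before building any pipeline, a Spark session with Spark NLP is started with a single call. A minimal sketch:</p>
</div>
""", unsafe_allow_html=True)

st.code("""
import sparknlp

# Starts (or reuses) a SparkSession with Spark NLP on the classpath
spark = sparknlp.start()
print("Spark NLP version:", sparknlp.version())
""", language="python")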

# Pipeline and Results
st.markdown('<div class="sub-title">Pipeline and Results</div>', unsafe_allow_html=True)

st.markdown("""
<div class="section">
    <p>In this section, we’ll build a pipeline using Spark NLP to process a table and answer questions about the data it contains. We will utilize two different TAPAS models, each suited for different types of queries.</p>
</div>
""", unsafe_allow_html=True)

# Step 1: Creating the Data
st.markdown("""
<div class="section">
    <h4>Step 1: Creating the Data</h4>
    <p>We'll start by creating a Spark DataFrame that includes a table in JSON format and a set of questions.</p>
</div>
""", unsafe_allow_html=True)

st.code("""
json_data = '''
{
    "header": ["name", "money", "age"],
    "rows": [
        ["Donald Trump", "$100,000,000", "75"],
        ["Elon Musk", "$20,000,000,000,000", "55"]
    ]
}
'''

queries = [
    "Who earns less than 200,000,000?",
    "Who earns 100,000,000?",
    "How much money has Donald Trump?",
    "How old are they?",
    "How much money have they total?",
    "Who earns more than Donald Trump?"
]

data = spark.createDataFrame([[json_data, " ".join(queries)]])\\
    .toDF("table_json", "questions")
""", language="python")

# Step 2: Assembling the Pipeline
st.markdown("""
<div class="section">
    <h4>Step 2: Assembling the Pipeline</h4>
    <p>We will now set up a Spark NLP pipeline that includes the necessary annotators for processing the table and questions.</p>
</div>
""", unsafe_allow_html=True)

st.code("""
from sparknlp.annotator import TapasForQuestionAnswering, SentenceDetector
from sparknlp.base import MultiDocumentAssembler, TableAssembler
from pyspark.ml import Pipeline
from pyspark.sql import functions as F

# Step 1: Transforms raw texts to `document` annotations
document_assembler = MultiDocumentAssembler() \\
    .setInputCols("table_json", "questions") \\
    .setOutputCols("document_table", "document_questions")

# Step 2: Getting the sentences
sentence_detector = SentenceDetector() \\
    .setInputCols(["document_questions"]) \\
    .setOutputCol("questions")

# Step 3: Get the tables
table_assembler = TableAssembler()\\
    .setInputCols(["document_table"])\\
    .setOutputCol("table")

# WTQ TAPAS model
tapas_wtq = TapasForQuestionAnswering\\
    .pretrained("table_qa_tapas_base_finetuned_wtq", "en")\\
    .setInputCols(["questions", "table"])\\
    .setOutputCol("answers_wtq")

# SQA TAPAS model
tapas_sqa = TapasForQuestionAnswering\\
    .pretrained("table_qa_tapas_base_finetuned_sqa", "en")\\
    .setInputCols(["questions", "table"])\\
    .setOutputCol("answers_sqa")

# Define pipeline
pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    table_assembler,
    tapas_wtq,
    tapas_sqa
])

# Fit and transform data
model = pipeline.fit(data)
result = model.transform(data)
""", language="python")

# Step 3: Viewing the Results
st.markdown("""
<div class="section">
    <h4>Step 3: Viewing the Results</h4>
    <p>After processing, we can explore the results generated by each model:</p>
</div>
""", unsafe_allow_html=True)

st.code("""
# WTQ Model Results:
result.select(F.explode(result.answers_wtq)).show(truncate=False)
""", language="python")

st.text("""
+--------------------------------------+
|col                                   |
+--------------------------------------+
|Donald Trump                          |
|Donald Trump                          |
|SUM($100,000,000)                     |
|AVERAGE(75, 55)                       |
|SUM($100,000,000, $20,000,000,000,000)|
|Elon Musk                             |
+--------------------------------------+
""")

st.code("""
# SQA Model Results:
result.select(F.explode(result.answers_sqa)).show(truncate=False)
""", language="python")

st.text("""
+---------------------------------+
|col                              |
+---------------------------------+
|Donald Trump                     |
|Donald Trump                     |
|$100,000,000                     |
|75, 55                           |
|$100,000,000, $20,000,000,000,000|
|Elon Musk                        |
+---------------------------------+
""")

# Comparing Results
st.markdown("""
<div class="section">
    <h4>Comparing Results</h4>
    <p>To better understand the differences, we can compare the results from both models side by side:</p>
</div>
""", unsafe_allow_html=True)

st.code("""
result.select(F.explode(F.arrays_zip(result.questions.result,
                                     result.answers_sqa.result,
                                     result.answers_wtq.result)).alias("cols"))\\
    .select(F.expr("cols['0']").alias("question"),
            F.expr("cols['1']").alias("answer_sqa"),
            F.expr("cols['2']").alias("answer_wtq")).show(truncate=False)
""", language="python")

st.text("""
+---------------------------------+---------------------------------+--------------------------------------+
|question                         |answer_sqa                       |answer_wtq                            |
+---------------------------------+---------------------------------+--------------------------------------+
|Who earns less than 200,000,000? |Donald Trump                     |Donald Trump                          |
|Who earns 100,000,000?           |Donald Trump                     |Donald Trump                          |
|How much money has Donald Trump? |$100,000,000                     |SUM($100,000,000)                     |
|How old are they?                |75, 55                           |AVERAGE(75, 55)                       |
|How much money have they total?  |$100,000,000, $20,000,000,000,000|SUM($100,000,000, $20,000,000,000,000)|
|Who earns more than Donald Trump?|Elon Musk                        |Elon Musk                             |
+---------------------------------+---------------------------------+--------------------------------------+
""")

# One-Liner Alternative
st.markdown("""
<div class="section">
    <h4>One-Liner Alternative</h4>
    <p>For those who prefer a simpler approach, John Snow Labs offers a one-liner API to quickly get answers using TAPAS models.</p>
</div>
""", unsafe_allow_html=True)

st.code("""
# Download the johnsnowlabs library
pip install johnsnowlabs
""", language="bash")

st.code("""
import pandas as pd
from johnsnowlabs import nlp

# Create the context DataFrame
context_df = pd.DataFrame({
    'name': ['Donald Trump', 'Elon Musk'],
    'money': ['$100,000,000', '$20,000,000,000,000'],
    'age': ['75', '55']
})

# Define the questions
questions = [
    "Who earns less than 200,000,000?",
    "Who earns 100,000,000?",
    "How much money has Donald Trump?",
    "How old are they?",
    "How much money have they total?",
    "Who earns more than Donald Trump?"
]

# Combine context and questions into a tuple
tapas_data = (context_df, questions)

# Use the one-liner API with the WTQ model
answers_wtq = nlp.load('en.answer_question.tapas.wtq.large_finetuned').predict(tapas_data)
answers_wtq[['sentence', 'tapas_qa_UNIQUE_answer']]
""", language="python")

# Expected output of the one-liner, rendered as a DataFrame
data = {
    "sentence": [
        "Who earns less than 200,000,000?",
        "Who earns 100,000,000?",
        "How much money has Donald Trump?",
        "How old are they?",
        "How much money have they total? Who earns more..."
    ],
    "tapas_qa_UNIQUE_answer": [
        "Donald Trump",
        "Donald Trump",
        "SUM($100,000,000)",
        "SUM(55)",
        "SUM($20,000,000,000,000)"
    ]
}
st.dataframe(pd.DataFrame(data))

# Model Information and Use Cases
st.markdown("""
<div class="section">
    <h4>Model Information and Use Cases</h4>
    <p>Understanding the strengths of each TAPAS model can help you choose the right tool for your task.</p>
    <ul>
        <li><b>table_qa_tapas_base_finetuned_wtq</b></li>
        <ul>
            <li>Best for: answering questions involving table-wide aggregation (e.g., sums, averages).</li>
        </ul>
        <li><b>table_qa_tapas_base_finetuned_sqa</b></li>
        <ul>
            <li>Best for: answering questions in a sequential question-answering context, where the current question depends on previous answers.</li>
        </ul>
    </ul>
</div>
""", unsafe_allow_html=True)
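
# Illustrative addition: a rough heuristic for routing questions to a model output.
# The keyword list is an assumption, not exhaustive; column names follow the pipeline above.
st.markdown("""
<div class="section">
    <p>As a rough sketch, a simple keyword heuristic could route each question to the model output that suits it best (the keyword list below is illustrative only):</p>
</div>
""", unsafe_allow_html=True)

st.code("""
AGGREGATION_HINTS = ("total", "sum", "average", "how many", "count")

def pick_answer_column(question: str) -> str:
    # WTQ handles aggregation; SQA handles direct or sequential questions
    if any(hint in question.lower() for hint in AGGREGATION_HINTS):
        return "answers_wtq"
    return "answers_sqa"

print(pick_answer_column("How much money have they total?"))  # answers_wtq
print(pick_answer_column("How old are they?"))                # answers_sqa
""", language="python")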

# Conclusion
st.markdown("""
<div class="section">
    <h4>Conclusion</h4>
    <p>TAPAS, integrated with Spark NLP, provides a powerful solution for question-answering over tables, capable of handling both complex aggregation queries and straightforward Q&A tasks. Whether you're working with large datasets or simple tables, TAPAS offers flexibility and scalability. The <code>table_qa_tapas_base_finetuned_wtq</code> model excels in aggregation tasks, while <code>table_qa_tapas_base_finetuned_sqa</code> is best for direct, sequential question-answering.</p>
    <p>By following this guide, you can efficiently implement TAPAS in your own projects, leveraging Spark NLP's powerful processing capabilities to extract insights from your data.</p>
</div>
""", unsafe_allow_html=True)

# References
st.markdown("""
<div class="section">
    <h4>References</h4>
    <ul>
        <li>Documentation: <a class="link" href="https://nlp.johnsnowlabs.com/docs/en/annotators#multidocumentassembler" target="_blank" rel="noopener">MultiDocumentAssembler</a>, <a class="link" href="https://nlp.johnsnowlabs.com/docs/en/annotators#TapasForQuestionAnswering" target="_blank" rel="noopener">TapasForQuestionAnswering</a></li>
        <li>Python Doc: <a class="link" href="https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp/base/multi_document_assembler/index.html#sparknlp.base.multi_document_assembler.MultiDocumentAssembler.setIdCol" target="_blank" rel="noopener">MultiDocumentAssembler</a>, <a class="link" href="https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp/annotator/classifier_dl/tapas_for_question_answering/index.html" target="_blank" rel="noopener">TapasForQuestionAnswering</a></li>
        <li>Scala Doc: <a class="link" href="https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/MultiDocumentAssembler.html" target="_blank" rel="noopener">MultiDocumentAssembler</a>, <a class="link" href="https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/annotators/classifier/dl/TapasForQuestionAnswering.html" target="_blank" rel="noopener">TapasForQuestionAnswering</a></li>
        <li>Models Used: <a class="link" href="https://sparknlp.org/2022/09/30/table_qa_tapas_base_finetuned_wtq_en.html" target="_blank" rel="noopener">table_qa_tapas_base_finetuned_wtq</a>, <a class="link" href="https://sparknlp.org/2022/09/30/table_qa_tapas_base_finetuned_sqa_en.html" target="_blank" rel="noopener">table_qa_tapas_base_finetuned_sqa</a></li>
        <li>For extended examples of usage, see the notebooks for <a class="link" href="https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-assembler/Loading_Multiple_Documents.ipynb" target="_blank" rel="noopener">MultiDocumentAssembler</a> and <a class="link" href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Public/15.1_Table_Question_Answering.ipynb" target="_blank" rel="noopener">TapasForQuestionAnswering</a>.</li>
        <li><a class="link" href="https://arxiv.org/abs/2004.02349" target="_blank" rel="noopener">TaPas: Weakly Supervised Table Parsing via Pre-training</a></li>
        <li><a class="link" href="https://nlp.johnsnowlabs.com/" target="_blank" rel="noopener">Spark NLP Documentation</a></li>
        <li><a class="link" href="https://nlp.johnsnowlabs.com/models" target="_blank" rel="noopener">John Snow Labs Models Hub</a></li>
    </ul>
</div>
""", unsafe_allow_html=True)

# Community & Support
st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <ul>
        <li><a class="link" href="https://sparknlp.org/" target="_blank" rel="noopener">Official Website</a>: Documentation and examples</li>
        <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank" rel="noopener">Slack</a>: Live discussion with the community and team</li>
        <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank" rel="noopener">GitHub</a>: Bug reports, feature requests, and contributions</li>
        <li><a class="link" href="https://medium.com/spark-nlp" target="_blank" rel="noopener">Medium</a>: Spark NLP articles</li>
        <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank" rel="noopener">YouTube</a>: Video tutorials</li>
    </ul>
</div>
""", unsafe_allow_html=True)
requirements.txt
ADDED
@@ -0,0 +1,6 @@
streamlit
st-annotated-text
pandas
numpy
spark-nlp
pyspark
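# Note: spark-nlp releases target specific pyspark versions; pinning both
# (e.g., spark-nlp==x.y.z with a matching pyspark) keeps builds reproducible.
# See the Spark NLP installation docs at sparknlp.org for the compatibility matrix.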