diff --git a/README.md b/README.md
index edb639d..1eea004 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-# InfiCoder-Eval
+# InfiAgent
 
-This is the repository that contains source code for the [InfiCoder-Eval](https://github.com/infi-coder/inficoder-eval).
+This is the repository that contains source code for the [InfiAgent](https://infi-coder.github.io/inficoder-eval/) website.
 
 ```
 
diff --git a/_config.yaml b/_config.yaml
new file mode 100644
index 0000000..9408c5a
--- /dev/null
+++ b/_config.yaml
@@ -0,0 +1,91 @@
+# Welcome to Jekyll!
+#
+# This config file is meant for settings that affect your whole blog, values
+# which you are expected to set up once and rarely edit after that. If you find
+# yourself editing this file very often, consider using Jekyll's data files
+# feature for the data you need to update frequently.
+#
+# For technical reasons, this file is *NOT* reloaded automatically when you use
+# 'bundle exec jekyll serve'. If you change this file, please restart the server process.
+#
+# If you need help with YAML syntax, here are some quick references for you: 
+# https://learn-the-web.algonquindesign.ca/topics/markdown-yaml-cheat-sheet/#yaml
+# https://learnxinyminutes.com/docs/yaml/
+#
+# Site settings
+# These are used to personalize your new site. If you look in the HTML files,
+# you will see them accessed via {{ site.title }}, {{ site.email }}, and so on.
+# You can create any custom variable you would like, and they will be accessible
+# in the templates via {{ site.myvariable }}.
+
+title: "InfiCoder-Eval: Systematically Evaluating Question-Answering"
+# email: anonymous@github.com
+description: >- # this means to ignore newlines until "baseurl:"
+  This is the website for the InfiCoder-Eval benchmark
+baseurl: "/inficoder-eval" # the subpath of your site, e.g. /blog
+url: "https://infi-coder.github.io" # the base hostname & protocol for your site, e.g. http://example.com
+# twitter_username: sokcertifiedrobustness
+github_username:  llylly
+
+# Build settings
+# theme: minima
+# plugins:
+  # - jekyll-feed
+
+
+minima:
+  date_format: "%b %-d, %Y"
+
+  # generate social links in footer
+  # social_links:
+    # twitter: jekyllrb
+    # github:  sokcertifiedrobustness
+    # devto: jekyll
+    # dribbble: jekyll
+    # facebook: jekyll
+    # flickr:   jekyll
+    # instagram: jekyll
+    # linkedin: jekyll
+    # pinterest: jekyll
+    # youtube: jekyll
+    # youtube_channel: UC8CXR0-3I70i1tfPg1PAE1g
+    # youtube_channel_name: CloudCannon
+    # telegram: jekyll
+    # googleplus: +jekyll
+    # microdotblog: jekyll
+    # keybase: jekyll
+
+    # Mastodon instances
+    # mastodon:
+    # - username: jekyll
+    #   instance: example.com
+    # - username: jekyll2
+    #   instance: example.com
+
+    # GitLab instances
+    # gitlab:
+    # - username: jekyll
+    #   instance: example.com
+    # - username: jekyll2
+    #   instance: example.com
+
+# Exclude from processing.
+# The following items will not be processed, by default.
+# Any item listed under the `exclude:` key here will be automatically added to
+# the internal "default list".
+#
+# Excluded items can be processed by explicitly listing the directories or
+# their entries' file path in the `include:` list.
+#
+# exclude:
+#   - index.html
+#   - .sass-cache/
+#   - .jekyll-cache/
+#   - gemfiles/
+#   - Gemfile
+#   - Gemfile.lock
+#   - node_modules/
+#   - vendor/bundle/
+#   - vendor/cache/
+#   - vendor/gems/
+#   - vendor/ruby/
\ No newline at end of file
diff --git a/_data/leaderboard.yml b/_data/leaderboard.yml
index 5be2082..dc98cf1 100644
--- a/_data/leaderboard.yml
+++ b/_data/leaderboard.yml
@@ -1,365 +1,178 @@
 records:
 - comment: null
-  devscore: 61.28%
-  devscore_std: 1.45%
+  aps_score: 74.60%
+  as_score: 78.72%
+  a_score: 79.01%
   link: https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
   rank: 1
-  score: 58.89%
-  score_std: 0.74%
   size: null
-  testscore: 56.49%
-  testscore_std: 0.19%
-  title: GPT-4 (0613)
+  title: gpt-4-0613
 - comment: null
-  devscore: 50.13%
-  devscore_std: null
-  link: https://deepseekcoder.github.io/
+  aps_score: 60.77%
+  as_score: 67.50%
+  a_score: 62.98%
+  link: https://arxiv.org/abs/2401.05507
   rank: 2
-  score: 50.34%
-  score_std: null
-  size: 33
-  testscore: 50.55%
-  testscore_std: null
-  title: deepseek-coder-33b-instruct
+  size: 34
+  title: daagent-34b 
 - comment: null
-  devscore: 46.16%
-  devscore_std: 0.72%
-  link: https://platform.openai.com/docs/models/gpt-3-5
+  aps_score: 60.13%
+  as_score: 65.94%
+  a_score: 64.27%
+  link: https://api.minimax.chat/
   rank: 3
-  score: 46.70%
-  score_std: 0.70%
   size: null
-  testscore: 47.24%
-  testscore_std: 1.75%
-  title: GPT-3.5-turbo (0613)
+  title: abab5.5-chat  
 - comment: null
-  devscore: 46.52%
-  devscore_std: null
-  link: https://huggingface.co/WizardLM/WizardCoder-Python-34B-V1.0
+  aps_score: 58.20%
+  as_score: 65.70%
+  a_score: 61.88%
+  link: https://platform.openai.com/docs/models/gpt-3-5
   rank: 4
-  score: 45.16%
-  score_std: null
-  size: 34
-  testscore: 43.80%
-  testscore_std: null
-  title: WizardCoder-Python-34B-V1.0
+  size: null
+  title: gpt-3.5-turbo-0613
 - comment: null
-  devscore: 44.03%
-  devscore_std: null
-  link: https://github.com/facebookresearch/codellama
+  aps_score: 57.56%
+  as_score: 62.46%
+  a_score: 56.17%
+  link: https://platform.openai.com/docs/models/gpt-3-5
   rank: 5
-  score: 43.71%
-  score_std: null
-  size: 34
-  testscore: 43.39%
-  testscore_std: null
-  title: CodeLlama-34B-Instruct
+  size: 72
+  title: qwen-72b-chat
 - comment: null
-  devscore: 42.28%
-  devscore_std: 0.24%
-  link: https://deepseekcoder.github.io/
+  aps_score: 54.52%
+  as_score: 60.51%
+  a_score: 55.72%
+  link: https://arxiv.org/abs/2401.05507
   rank: 6
-  score: 42.97%
-  score_std: 0.22%
-  size: 6.7
-  testscore: 43.66%
-  testscore_std: 0.26%
-  title: deepseek-coder-6.7b-instruct
+  size: 13
+  title: daagent-13b 
 - comment: null
-  devscore: 41.46%
-  devscore_std: 1.13%
-  link: https://huggingface.co/WizardLM/WizardCoder-Python-13B-V1.0
+  aps_score: 53.38%
+  as_score: 58.32%
+  a_score: 51.93%
+  link: https://platform.openai.com/docs/models/gpt-3-5
   rank: 7
-  score: 41.22%
-  score_std: 0.75%
-  size: 13
-  testscore: 40.97%
-  testscore_std: 0.42%
-  title: WizardCoder-Python-13B-V1.0
+  size: null
+  title: gemini-pro
 - comment: null
-  devscore: 43.70%
-  devscore_std: 2.09%
-  link: https://huggingface.co/WizardLM/WizardCoder-Python-7B-V1.0
+  aps_score: 49.20%
+  as_score: 54.02%
+  a_score: 51.38%
+  link: https://platform.openai.com/docs/models/gpt-3-5
   rank: 8
-  score: 40.30%
-  score_std: 1.15%
-  size: 7
-  testscore: 36.90%
-  testscore_std: 0.22%
-  title: WizardCoder-Python-7B-V1.0
+  size: 46.7(12.9)
+  title: mixtral-8x7b-instruct-v0.1
 - comment: null
-  devscore: 39.71%
-  devscore_std: null
-  link: https://github.com/facebookresearch/codellama
+  aps_score: 49.02%
+  as_score: 57.63%
+  a_score: 54.19%
+  link: https://arxiv.org/abs/2401.05507
   rank: 9
-  score: 39.75%
-  score_std: null
-  size: 34
-  testscore: 39.79%
-  testscore_std: null
-  title: CodeLlama-34B
+  size: 7
+  title: daagent-7b
 - comment: null
-  devscore: 41.20%
-  devscore_std: 0.25%
-  link: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta
+  aps_score: 44.84%
+  as_score: 48.90%
+  a_score: 46.31%
+  link: https://deepseekcoder.github.io/
   rank: 10
-  score: 39.59%
-  score_std: 0.68%
-  size: 7
-  testscore: 37.97%
-  testscore_std: 1.29%
-  title: Zypher-7b-beta
+  size: 33
+  title: deepseek-coder-33b-instruct
 - comment: null
-  devscore: 35.45%
-  devscore_std: 0.56%
-  link: https://huggingface.co/bigcode/octocoder
+  aps_score: 43.41%
+  as_score: 49.65%
+  a_score: 46.96%
+  link: https://platform.openai.com/docs/models/gpt-3-5
   rank: 11
-  score: 37.72%
-  score_std: 0.58%
-  size: 15.5
-  testscore: 40.00%
-  testscore_std: 1.69%
-  title: OctoCoder
+  size: null
+  title: claude-2.1
 - comment: null
-  devscore: 36.03%
-  devscore_std: 1.37%
-  link: https://github.com/facebookresearch/codellama
+  aps_score: 42.02%
+  as_score: 47.41%
+  a_score: 44.40%
+  link: https://platform.openai.com/docs/models/gpt-3-5
   rank: 12
-  score: 37.18%
-  score_std: 0.51%
-  size: 13
-  testscore: 38.34%
-  testscore_std: 0.36%
-  title: CodeLlama-13B-Instruct
+  size: 34
+  title: phind-codellama-34b-v2
 - comment: null
-  devscore: 36.19%
-  devscore_std: 0.75%
-  link: https://huggingface.co/Qwen/Qwen-14B-Chat
+  aps_score: 39.87%
+  as_score: 45.46%
+  a_score: 42.73%
+  link: https://platform.openai.com/docs/models/gpt-3-5
   rank: 13
-  score: 36.93%
-  score_std: 0.12%
-  size: 14
-  testscore: 37.68%
-  testscore_std: 0.91%
-  title: Qwen-14B-Chat
+  size: 34
+  title: xwincoder-34b
 - comment: null
-  devscore: 31.78%
-  devscore_std: 1.01%
-  link: https://github.com/facebookresearch/codellama
+  aps_score: 36.45%
+  as_score: 41.61%
+  a_score: 36.90%
+  link: https://platform.openai.com/docs/models/gpt-3-5
   rank: 14
-  score: 33.92%
-  score_std: 0.64%
-  size: 13
-  testscore: 36.06%
-  testscore_std: 0.49%
-  title: CodeLlama-13B
+  size: 7
+  title: mistral-7b-instruct-v0.2
 - comment: null
-  devscore: 31.85%
-  devscore_std: 0.60%
-  link: https://huggingface.co/WizardLM/WizardCoder-15B-V1.0
+  aps_score: 36.45%
+  as_score: 41.36%
+  a_score: 34.51%
+  link: https://huggingface.co/Qwen/Qwen-14B-Chat
   rank: 15
-  score: 33.34%
-  score_std: 0.74%
-  size: 15
-  testscore: 34.83%
-  testscore_std: 1.86%
-  title: WizardCoder-15B-V1.0
+  size: 14
+  title: qwen-14b-chat
 - comment: null
-  devscore: 32.10%
-  devscore_std: null
-  link: https://github.com/facebookresearch/codellama
+  aps_score: 27.27%
+  as_score: 27.27%
+  a_score: 16.00%
+  link: https://huggingface.co/Qwen/Qwen-7B-Chat
   rank: 16
-  score: 33.23%
-  score_std: null
-  size: 34
-  testscore: 34.35%
-  testscore_std: null
-  title: CodeLlama-34B-Python
-- comment: null
-  devscore: 31.55%
-  devscore_std: 2.01%
-  link: https://huggingface.co/bigcode/octogeex
-  rank: 17
-  score: 32.60%
-  score_std: 1.02%
-  size: 6
-  testscore: 33.65%
-  testscore_std: 0.91%
-  title: OctoGeeX
-- comment: null
-  devscore: 31.87%
-  devscore_std: 0.66%
-  link: https://huggingface.co/Qwen/Qwen-7B-Chat
-  rank: 18
-  score: 32.48%
-  score_std: 0.71%
-  size: 7
-  testscore: 33.10%
-  testscore_std: 0.77%
-  title: Qwen-7B-Chat
-- comment: null
-  devscore: 31.09%
-  devscore_std: 1.85%
-  link: https://github.com/facebookresearch/codellama
-  rank: 19
-  score: 32.43%
-  score_std: 0.42%
-  size: 13
-  testscore: 33.78%
-  testscore_std: 1.02%
-  title: CodeLlama-13B-Python
-- comment: null
-  devscore: 30.05%
-  devscore_std: 1.79%
-  link: https://github.com/facebookresearch/codellama
-  rank: 20
-  score: 31.07%
-  score_std: 0.87%
   size: 7
-  testscore: 32.09%
-  testscore_std: 0.05%
-  title: CodeLlama-7B
-- comment: null
-  devscore: 31.33%
-  devscore_std: 0.80%
-  link: https://huggingface.co/WizardLM/WizardCoder-3B-V1.0
-  rank: 21
-  score: 30.94%
-  score_std: 0.60%
-  size: 3
-  testscore: 30.56%
-  testscore_std: 0.58%
-  title: WizardCoder-3B-V1.0
+  title: qwen-7b-chat
 - comment: null
-  devscore: 30.15%
-  devscore_std: 0.73%
-  link: https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat
-  rank: 22
-  score: 30.34%
-  score_std: 0.76%
+  aps_score: 26.26%
+  as_score: 30.88%
+  a_score: 26.31%
+  link: https://platform.openai.com/docs/models/gpt-3-5
+  rank: 17
   size: 13
-  testscore: 30.52%
-  testscore_std: 1.32%
-  title: Baichuan2-13B-Chat
-- comment: null
-  devscore: 27.96%
-  devscore_std: 0.81%
-  link: https://github.com/facebookresearch/codellama
-  rank: 23
-  score: 29.51%
-  score_std: 0.97%
-  size: 7
-  testscore: 31.06%
-  testscore_std: 1.18%
-  title: CodeLlama-7B-Instruct
-- comment: null
-  devscore: 27.51%
-  devscore_std: 1.09%
-  link: https://github.com/facebookresearch/codellama
-  rank: 24
-  score: 28.88%
-  score_std: 0.45%
-  size: 7
-  testscore: 30.24%
-  testscore_std: 1.97%
-  title: CodeLlama-7B-Python
+  title: vicuna-13b-v1.5
 - comment: null
-  devscore: 26.53%
-  devscore_std: 1.12%
-  link: https://huggingface.co/WizardLM/WizardCoder-1B-V1.0
-  rank: 25
-  score: 27.11%
-  score_std: 0.85%
-  size: 1
-  testscore: 27.70%
-  testscore_std: 0.72%
-  title: WizardCoder-1B-V1.0
-- comment: null
-  devscore: 28.74%
-  devscore_std: 0.78%
-  link: https://huggingface.co/bigcode/starcoder
-  rank: 26
-  score: 26.79%
-  score_std: 0.18%
-  size: 15.5
-  testscore: 24.84%
-  testscore_std: 0.96%
-  title: StarCoder
-- comment: null
-  devscore: 24.45%
-  devscore_std: 1.55%
-  link: https://huggingface.co/bigcode/starcoderplus
-  rank: 27
-  score: 26.07%
-  score_std: 1.25%
-  size: 15.5
-  testscore: 27.69%
-  testscore_std: 2.13%
-  title: StarCoderPlus
+  aps_score: 23.79%
+  as_score: 26.82%
+  a_score: 25.05%
+  link: https://platform.openai.com/docs/models/gpt-3-5
+  rank: 18
+  size: 20
+  title: internlm-chat-20b
 - comment: null
-  devscore: 23.36%
-  devscore_std: 1.77%
-  link: https://huggingface.co/Salesforce/codegen25-7b-instruct
-  rank: 28
-  score: 25.67%
-  score_std: 1.57%
-  size: 7
-  testscore: 27.98%
-  testscore_std: 1.46%
-  title: CodeGen2.5-7B-instruct
+  aps_score: 23.13%
+  as_score: 26.45%
+  a_score: 22.40%
+  link: https://huggingface.co/WizardLM/WizardCoder-Python-34B-V1.0
+  rank: 19
+  size: 34
+  title: wizardcoder-python-34b-v1.0
 - comment: null
-  devscore: 25.44%
-  devscore_std: 0.02%
-  link: https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat
-  rank: 29
-  score: 24.39%
-  score_std: 0.25%
+  aps_score: 16.99%
+  as_score: 20.71%
+  a_score: 17.89%
+  link: https://platform.openai.com/docs/models/gpt-3-5
+  rank: 20
   size: 7
-  testscore: 23.34%
-  testscore_std: 0.50%
-  title: Baichuan2-7B-Chat
-- comment: null
-  devscore: 17.47%
-  devscore_std: 1.56%
-  link: https://platform.openai.com/docs/models/gpt-base
-  rank: 30
-  score: 19.08%
-  score_std: 1.00%
-  size: /
-  testscore: 20.70%
-  testscore_std: 0.70%
-  title: davinci-002
+  title: agentlm-7b
 - comment: null
-  devscore: 14.10%
-  devscore_std: 0.84%
-  link: https://huggingface.co/microsoft/phi-1_5
-  rank: 31
-  score: 16.63%
-  score_std: 0.03%
-  size: 1.5
-  testscore: 19.16%
-  testscore_std: 0.87%
-  title: phi-1.5
-- comment: null
-  devscore: 16.20%
-  devscore_std: 0.65%
-  link: https://github.com/THUDM/CodeGeeX2
-  rank: 32
-  score: 16.50%
-  score_std: 0.39%
+  aps_score: 16.67%
+  as_score: 20.59%
+  a_score: 19.27%
+  link: https://platform.openai.com/docs/models/gpt-3-5
+  rank: 21
   size: 6
-  testscore: 16.80%
-  testscore_std: 0.33%
-  title: CodeGeeX2
+  title: chatglm3-6b
 - comment: null
-  devscore: 11.45%
-  devscore_std: 0.19%
-  link: https://huggingface.co/microsoft/phi-1
-  rank: 33
-  score: 12.84%
-  score_std: 0.73%
-  size: 1.3
-  testscore: 14.23%
-  testscore_std: 1.28%
-  title: phi-1
-settings: main
+  aps_score: 14.56%
+  as_score: 17.39%
+  a_score: 13.94%
+  link: https://github.com/facebookresearch/codellama
+  rank: 22
+  size: 34
+  title: codellama-34b-instruct
+# settings: main
diff --git a/_layout/mydefault.html b/_layout/mydefault.html
new file mode 100644
index 0000000..f35da65
--- /dev/null
+++ b/_layout/mydefault.html
@@ -0,0 +1,2 @@
+<!DOCTYPE html>
+{{ content }}
\ No newline at end of file
diff --git a/_site/README.md b/_site/README.md
index 56bbbd7..1eea004 100644
--- a/_site/README.md
+++ b/_site/README.md
@@ -1,6 +1,6 @@
-# DS-1000
+# InfiAgent
 
-This is the repository that contains source code for the [DS-1000 website](https://ds1000-code-gen.github.io).
+This is the repository that contains source code for the [InfiAgent](https://infi-coder.github.io/inficoder-eval/) website.
 
 ```
 
diff --git a/_site/github_token.txt b/_site/github_token.txt
new file mode 100644
index 0000000..310fced
--- /dev/null
+++ b/_site/github_token.txt
@@ -0,0 +1 @@
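+REDACTED: a GitHub personal access token was committed here. Revoke it immediately, delete this file, and purge it from the repository history.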
\ No newline at end of file
diff --git a/_site/index.html b/_site/index.html
index 4d4d5c0..25fb55b 100644
--- a/_site/index.html
+++ b/_site/index.html
@@ -2,12 +2,10 @@
 
 <head>
   <meta charset="utf-8" />
-  <meta name="description" content="InfiCoder-Eval: Systematically Evaluating Question-Answering
-  for Code Large Language Models" />
-  <meta name="keywords" content="DS-1000, data-science, code-generation, codex, semantic-parsing" />
+  <meta name="description" content="InfiAgent: An Open-source Agent Framework" />
+  <meta name="keywords" content="InfiAgent-DS, code-generation, large-language-model, benchmark" />
   <meta name="viewport" content="width=device-width, initial-scale=1" />
-  <title>InfiCoder-Eval: Systematically Evaluating Question-Answering
-    for Code Large Language Models</title>
+  <title>InfiAgent: Building and Evaluating Agents on Data Analysis Tasks</title>
 
   <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet" />
 
@@ -32,12 +30,25 @@
   <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.11.3/css/dataTables.bootstrap5.min.css" />
   <script src="https://cdn.datatables.net/1.11.3/js/dataTables.bootstrap5.min.js"></script>
 
-  <link rel="icon" href="./static/images/inficoder_eval_logo2.png" />
+  
 
   <script defer="" src="./static/js/fontawesome.all.min.js"></script>
   <script src="./static/js/bulma-carousel.min.js"></script>
   <script src="./static/js/bulma-slider.min.js"></script>
   <script src="./static/js/index.js"></script>
+
+  <script type="text/javascript" async="" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-AMS_HTML">
+  </script>
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    tex2jax: {
+      inlineMath: [['$', '$'], ['\\(', '\\)']],
+      displayMath: [['$$', '$$'], ['\\[', '\\]']],
+      processEscapes: true
+    }
+  });
+  </script>
+
 </head>
 
 <body>
@@ -52,7 +63,7 @@
     </div>
     <div class="navbar-menu">
       <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
-        <a class="navbar-item" href="https://github.com/infi-coder">
+        <a class="navbar-item" href="https://github.com/InfiAgent/ADA-agent/">
           <span class="icon">
             <i class="fas fa-home"></i>
           </span>
@@ -63,8 +74,8 @@
             More
           </a>
           <div class="navbar-dropdown">
-            <a class="navbar-item" href="https://github.com/infi-coder">
-              InfiCoder Organization
+            <a class="navbar-item" href="https://github.com/InfiAgent/ADA-agent/tree/main/evaluation">
+              InfiAgent-Eval
             </a>
           </div>
         </div>
@@ -75,7 +86,13 @@
 
   <div class="container">
     <div class="column has-text-centered">
-      <img style="max-width: 200px; margin-bottom: -50px;" src="static/images/inficoder_eval_logo2.png" />
+      
+    </div>
+  </div>
+
+  <div class="container">
+    <div class="column has-text-centered">
+      <img style="max-width: 200px; margin-bottom: -50px;" src="static/images/infiagent_logo.png" />
     </div>
   </div>
 
@@ -84,22 +101,21 @@
       <div class="container is-max-desktop">
         <div class="columns is-centered">
           <div class="column has-text-centered">
-            <h1 class="title is-1 publication-title">InfiCoder-Eval: Systematically Evaluating Question-Answering
-              for Code Large Language Models
+            <h1 class="title is-1 publication-title">InfiAgent-Eval: Evaluating Agents on Data Analysis Tasks
             </h1>
             <div class="is-size-5 publication-authors">
               <span class="author-block">
-                InfiCoder Team @ ByteDance Ltd. and Peking University
+                InfiAgent Team @ ByteDance Ltd. and Zhejiang University 
               </span>
               <br />
-              <span class="author-block">
-                <!-- Main Maintainer: <a href="mailto:linyi.li@bytedance.com">Linyi Li</a> -->
-                <br />
+              <!-- <span class="author-block">
+                Main Maintainer: <a href="mailto:linyi.li@bytedance.com">Linyi Li</a>
+                <br>
                 Team Lead: <a href="mailto:hx.yang@bytedance.com">Hongxia Yang</a>
-              </span>
+              </span> -->
               <!-- <span class="author-block">
                 <a href="https://xxx.github.io/">Siwei Wang</a><sup>1</sup></span>
-			        <br/>
+			        <br/> -->
             </div>
 
             <div class="is-size-5 publication-authors">
@@ -109,24 +125,16 @@ <h1 class="title is-1 publication-title">InfiCoder-Eval: Systematically Evaluati
 
             <div class="column has-text-centered">
               <div class="publication-links">
-                <!-- PDF Link. -->
-                <span class="link-block">
-                  <a href="./static/report/inficoder_eval_report_draft.pdf" class="external-link button is-normal is-rounded is-dark" target="_blank">
-                    <span class="icon">
-                      <i class="ai ai-arxiv"></i>
-                    </span>
-                    <span>Report (draft version)</span>
-                  </a>
-                </span>
+                
                 <!-- Dataset Link. -->
                 <span class="link-block">
-                  <a href="https://github.com/infi-coder/ffqa-evaluation-harness" class="external-link button is-normal is-rounded is-dark" target="_blank">
+                  <a href="https://github.com/InfiAgent/ADA-agent" class="external-link button is-normal is-rounded is-dark" target="_blank">
                     <span class="icon">
                       <i class="fab fa-github"></i>
                     </span>
-                    <span>Inference Repo</span>
+                    <span>Project Repo</span>
                   </a>
-                  <a href="https://github.com/infi-coder/inficoder-eval" class="external-link button is-normal is-rounded is-dark" target="_blank">
+                  <a href="https://github.com/InfiAgent/ADA-agent/tree/main/evaluation" class="external-link button is-normal is-rounded is-dark" target="_blank">
                     <span class="icon">
                       <i class="fab fa-github"></i>
                     </span>
@@ -156,9 +164,9 @@ <h1 class="title is-1 publication-title">InfiCoder-Eval: Systematically Evaluati
     <div class="container is-max-desktop">
       <div class="hero-body">
         <h2 class="subtitle has-text-centered">
-          InfiCoder-Eval is a systematic benchmark and evaluation framework for the free-form question-answering ability of code language models.
+          InfiAgent is a project to build and evaluate agents. We start from data analysis and build a benchmark, InfiAgent-Eval.
         </h2>
-        <img src="static/images/inficoder-eval-main.png" />
+        <img src="static/images/framework.png" />
       </div>
     </div>
   </section>
@@ -172,28 +180,7 @@ <h2 class="subtitle has-text-centered">
           <h2 class="title is-3">Overview</h2>
           <div class="content has-text-justified">
             <p>
-              Large language models for code (code LLMs)
-              have made huge progress. Evaluation benchmarks
-              for code LLMs, such as <a href="https://github.com/openai/human-eval">HumanEval</a>, <a href="https://ds1000-code-gen.github.io/">DS-1000</a>,
-              and <a href="https://arxiv.org/abs/2108.07732">MBPP</a>, predominantly focus on code generation. But they are insufficient to evaluate code
-              LLMs’ multifaceted ability. To fill this gap, we
-              propose InfiCoder-Eval, a large-scale free-form
-              question-answering (QA) benchmark for code.
-              InfiCoder-Eval comprises 270 carefully picked
-              high-quality StackOverflow questions, covering
-              18 programming languages. To tackle the evaluation challenge, InfiCoder-Eval includes an evaluation framework integrating four types of model-free metrics, and domain experts design the concrete criteria for each question. As confirmed
-              with human experiments, InfiCoder-Eval evaluation aligns with humans better than model-based evaluation and runs much faster at the
-              same time. We conduct a systematic evaluation with InfiCoder-Eval for more than 30 code
-              LLMs, leading to several interesting findings. For
-              example, though open-source code LLMs show
-              competitive performance with proprietary models in code generation (e.g., HumanEval), they
-              still have a large gap compared to proprietary
-              ones in InfiCoder-Eval and even the best proprietary LLM (GPT4) is still far from perfect (best
-              open-source model Deepseek-Coder 33B Instruct
-              achieves 50.34% and GPT4 achieves 58.89%).
-              Furthermore, our detailed analysis reveals several
-              weaknesses of current code LLMs. Benchmark,
-              evaluation tools, and detailed results are all publicly available.
+              The advent of Large Language Models (LLMs) has spurred the development of LLM-augmented Autonomous Agents (LAAs). These agents are capable of generating and executing code through ongoing interactions between their core LLM and the code execution environment. In this project, we introduce Infinite Agent (InfiAgent), an LAA focused on data analysis and code writing. Our agent is fine-tuned on multiple open-source LLMs including Llama2, ChatGLM3, and Code Llama. The fine-tuning process employs a unique pipeline for Supervised Fine-Tuning (SFT) data collection, involving the creation and optimization of ReAct conversations using GPT. Furthermore, we have developed a GPT-enabled automatic evaluation benchmark question set (InfiAgent-Eval), which covers various data analysis aspects such as visualization, correlation analysis, and data transformation, providing a comprehensive means for quantitatively assessing LAAs' performance across diverse tasks. Our preliminary results suggest that Infinite Agent could significantly advance the field of autonomous code generation and execution, with potential implications in areas such as software development, data science, and automated problem-solving. This page is dedicated to elucidating the intricacies of the InfiAgent-Eval framework, encompassing aspects such as dataset construction, evaluation metrics, analytical assessment, leaderboard organization, and the procedural nuances of pipeline onboarding.
             </p>
           </div>
         </div>
@@ -208,28 +195,42 @@ <h2 class="title is-3">Overview</h2>
       <!-- Example. -->
       <div class="columns is-centered has-text-centered">
         <div class="column is-four-fifths">
-          <h2 class="title is-3">Statistics and Examples</h2>
+          <h2 class="title is-3">Dataset Construction</h2>
           <div class="content has-text-justified">
-            <p>
-              InfiCoder-Eval comprises 270 carefully picked high-quality Stack Overflow questions, covering 18 programming languages.
-            </p>
 
-            <img src="static/images/data_domain_stats.png" />
 
-            <p>
-              We recruited five domain experts to create the benchmark and annotate the correctness evaluation criteria.
-              Specifically, the InfiCoder-Eval framework integrates four types of model-free metrics for evaluating the correctness: keywords matching, blank filling, unit testing, and dialogue similarity.
-            </p>
+          We build data analysis queries and responses from existing CSV files. Here is the construction pipeline.
+          <img src="static/images/dataset_construction_eval.png" /> 
+            
+            Our dataset contains 400 questions with 72 csv files. Here're some examples of our generated questions.
+	  <img src="static/images/question_examples.png" /> 
+     	    We classify csv files used in the evaluation dataset into 9 categories based on their domains:
 
-            <img src="static/images/data_examples.png" />
+            <ul>
+              <li>Finance and Economics </li>
+              <li>Health and Medical</li>
+              <li>Demographics and Social Science</li>
+              <li>Marketing and Consumer Behavior</li>
+              <li>Energy and Environmental Monitoring</li>
+              <li>Transportation, Logistics, and Tourism</li>
+              <li>Culture, Entertainment, and Media</li>
+              <li>Scientific Research and Technology</li>
+              <li>Other Categories</li>
+            </ul>
+
+            <p>Here's the pie chart for the file categorization:</p>
+
+             <img src="static/images/domain.png" />
+
+            <p>We also check the statistics on the data analysis concepts involved by each question.</p>
+
+            <img src="static/images/concept.png" />
+
+            We manually checked 50 questions generated by our method and found:
+
+            98% of the questions are meaningful; in only one case did the question become meaningless, because of incorrect regular expression matching. We also found one case where the question itself is partly unreasonable given the CSV file it is based on; in the answer gathering and filtering stage, we successfully filtered it out because ADA did not return the same answer all 3 times.
 
-            <p>
-              Below is the question type, metric type, and length statistics.
-            </p>
 
-            <center>
-              <img src="static/images/general_statistics.png" />
-            </center>
 
           </div>
         </div>
@@ -243,10 +244,29 @@ <h2 class="title is-3">Statistics and Examples</h2>
       <!-- Comparison. -->
       <div class="columns is-centered has-text-centered">
         <div class="column is-four-fifths">
-          <h2 class="title is-3">Comparison</h2>
+          <h2 class="title is-3">Metrics</h2>
           <div class="content has-text-justified">
-            <p>Existing benchmarks weigh heavily on code generation, unit-test-based evaluation, and a limited set of programming languages. InfiCoder-Eval processes a much higher diversity to reflect real-world code LLMs’ usage scenarios and is far from saturation.</p>
-            <img src="static/images/comparison.png" />
+
+            <!-- <img src="static/images/leaderboard.jpeg"> -->
+            
+            For closed-form questions, we use the following metrics:
+
+Proportional Accuracy by Subquestions (PASQ):
+$$
+\text{PASQ} = \frac{1}{N} \sum_{i=1}^{N} \left( \frac{1}{M_i} \sum_{j=1}^{M_i} I_{ij} \right)
+$$
+Here, $N$ is the total number of questions, $M_i$ is the number of subquestions for the i-th question, and $I_{ij}$ is the indicator function for the j-th subquestion of the i-th question.
+Accuracy by Questions (ABQ):
+$$
+\text{ABQ} = \frac{1}{N} \sum_{i=1}^{N} \left( \prod_{j=1}^{M_i} I_{ij} \right)
+$$
+In this expression, the product 
+$\prod_{j=1}^{M_i} I_{ij}$ equals 1 if all subquestions of the \(i\)-th question are answered correctly, and 0 otherwise.
+Uniform Accuracy by Subquestions (UASQ):
+$$
+\text{UASQ} = \frac{1}{\sum_{i=1}^{N} M_i} \sum_{i=1}^{N} \sum_{j=1}^{M_i} I_{ij}
+$$
+Here, the total accuracy is the sum of the values of the indicator function across all subquestions, normalized by the total number of subquestions in the dataset.
           </div>
         </div>
       </div>
@@ -254,49 +274,21 @@ <h2 class="title is-3">Comparison</h2>
     </div>
   </section>
 
-
   <section class="section">
     <div class="container is-max-desktop">
-      <!-- Perturbation and Prompt. -->
+      <!-- Comparison. -->
       <div class="columns is-centered has-text-centered">
         <div class="column is-four-fifths">
-          <h2 class="title is-3">Prompts and Evaluation Protocol</h2>
+          <h2 class="title is-3">Evaluation</h2>
           <div class="content has-text-justified">
-            Each question contains a system prompt and content prompt.
-            For questions whose responses are mainly in natural language, the system prompt is
-            <div class="highlighter-rouge">
-              <div class="highlight">
-                <code>You are a professional assistant for programmers. By default, questions and answers are in Markdown format. You are chatting with programmers, so please answer as briefly as possible.
-                </code>
-              </div>
-            </div>
-            For other questions, the system prompt is
-            <div class="highlighter-rouge">
-              <div class="highlight">
-                <code>You are a professional assistant for programmers. By default, questions and answers are in Markdown format.
-                </code>
-              </div>
-            </div>
-            We then format the system prompt and content prompt following each model's default instruction template.
-            If no instruction template specified, we use the prompt format 
-            <div class="highlight">
-              <code>{system prompt}\n{content prompt}
-              </code>
-            </div>
-            <p>We adopt <b>best@10</b> as the main evaluation metric, where 10 responses are sampled and evaluated for each question and the best score per question is recorded and summed up.
-              Throughout the evaluation, we set <b>sampling temperature T to be 0.2 and top p cut-off threshold to be 0.9</b>.
-              We leave the exploration of other hyperparameters as the future work.
-            </p>
-            <p>For score computation, we treat each question equally with one point each.
-              Given 270 questions in the benchmark, the full score is 270, and we by default report the percentage score (achieved score divided by the full score which is 270).
-              The one point for each question can be further decomposed into a few scoring points within each question.
-              For example, a question may contain four keywords with weights 2, 1, 1, and 1 each.
-              Then, matching each keyword can contribute to 0.4, 0.2, 0.2, and 0.2 points respectively to the final score.
-            </p>
+
+            
+            <p>For closed-form questions, we first prompt the LLM with the question description and constraints. Since most models struggle to follow the format requirements, we add a reformatting step after the model responds, in which gpt-3.5-turbo-16k rewrites the response to match the format requirements. The following figure illustrates this process: </p>
+		        <img src="static/images/case-study-eval-data.png" />
           </div>
         </div>
       </div>
-      <!--/ Perturbation and Prompt. -->
+      <!--/ Comparison. -->
     </div>
   </section>
 
@@ -305,20 +297,26 @@ <h2 class="title is-3">Prompts and Evaluation Protocol</h2>
         <div class="columns is-centered has-text-centered">
         <div class="column is-four-fifths">
           <h2 class="title is-3">Leaderboard</h2>
+
+          <div class="content has-text-justified">
+          <p>In this section, we provide a comprehensive evaluation of closed-source LLMs, such as GPT-4 and GPT-3.5, as well as widely used open-source LLMs. We observe that the closed-source LLMs demonstrate a proficient capacity to adhere to agent directives and produce logically coherent responses.
+          <span style="color: red;"> Regrettably, the open-source LLMs enumerated below are currently incapable of adhering to agent directives or delivering substantively meaningful responses. In response to this limitation, we have developed a Supervised Fine-Tuning (SFT) dataset aimed at refining these models. For these open-source LLMs, the table below delineates the enhancements observed post-finetuning. Please anticipate the imminent release of both the SFT dataset and its comprehensive details. </span></p> 
+           <img src="static/images/spider.png" />
+           </div>
         </div>
       </div>
     </div>
-    <br />
-    <div class="container is-max-desktop has-text-justified">
+    <!-- <br> -->
+    <!-- <div class="container is-max-desktop has-text-justified">
       <div class="columns is-centered has-text-centered">
         <div class="column is-four-fifths">
           <div>
-            <img src="static/images/all_results.png" />
+            <img src="static/images/all_results.png">
           </div>
           <p>Each blue point corresponds to one open-source model, with error bars for those smaller than 30B parameters. Proprietary models are plotted as lines with uncertainty ranges.</p>
         </div>
       </div>
-    </div>
+    </div> -->
     <div class="cover" id="contentCover">
       <!-- Baseline. -->
       <div class="container-t">
@@ -327,18 +325,22 @@ <h2 class="title is-3">Leaderboard</h2>
             <div class="infoCard">
               <div class="infoBody">
                 <p align="left">
-                  <div class="left"><b>Notice</b>: we set the max tokens to generate=1024 (since GPT4 generates 662 tokens without the constraint, we provide some wiggle room by setting to 1024 tokens)
+                
+               
+                  <div class="left"><b>Notice</b>: We set temperature=0.2, top_p=1.0 and frequency_penalty=0.0 for all the models.  
+                  <!-- <p style="color: red;"> Regrettably, the open-source LLMs enumerated below are currently incapable of adhering to agent directives or delivering substantively meaningful responses. In response to this limitation, we have developed a Supervised Fine-Tuning (SFT) dataset aimed at refining these models. For these open-source LLMs, the table below delineates the enhancements observed post-finetuning. Please anticipate the imminent release of both the SFT dataset and its comprehensive details. </p> -->
                   </div>
+                 
                 </p>
-                <p align="left">
+                <!-- <p align="left">
                   <div class="left">We evenly split the 270 benchmark questions to 135-question dev set and 135-question test set. Dev set is publicly available, and the test set is on held where evaluation is available upon request (see below for instructions). 
                   Models are ranked according to full set scores.
                   </div>
                 </p>
                 <p align="left">
-                  <div class="left">For models with &gt;30B parameters, we evaluate once due to resource limit, otherwise we evaluate three times and report the mean and standard deviation.
+                  <div class="left">For models with >30B parameters, we evaluate once due to resource limit, otherwise we evaluate three times and report the mean and standard deviation.
                   </div>
-                </p>
+                </p> -->
                 <br />
                 <table class="table maintable stripe hover row-border order-column" id="maintable">
                   <thead>
@@ -346,13 +348,13 @@ <h2 class="title is-3">Leaderboard</h2>
                       <th>Rank</th>
                       <th>Model Name</th>
                       <th># Params. (in B)</th>
-                      <th>Full Set Score</th>
-                      <th>Full Set Std</th>
-                      <th>Dev Set Score</th>
-                      <th>Dev Set Std</th>
+                      <th>Proportional Accuracy by Subquestions</th>
+                      <th>Accuracy by Questions</th>
+                      <th>Uniform Accuracy by Subquestions</th>
+                      <!-- <th>Dev Set Std</th>
                       <th>Test Set Score</th>
                       <th>Test Set Std</th>
-                      <th></th>
+                      <th></th> -->
                     </tr>
                   </thead>
                   <tbody>
@@ -363,756 +365,179 @@ <h2 class="title is-3">Leaderboard</h2>
                         <td><a href="https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo">GPT-4 (0613)</a></td>
                       
                       
-                        <td>/</td>
+                        <td class="dt-center">/</td>
                       
-                      <td>58.89%</td>
-                      
-                        <td>0.74%</td>
-                      
-                      <td>61.28%</td>
+                      <!-- <td></td> -->
+                      <td class="dt-center">65.26%</td>
+                      <td class="dt-center">66.05%</td>
+                      <td class="dt-center">59.75%</td>
+                      <!-- 
+                        <td></td>
                       
-                        <td>1.45%</td>
+                      <td></td>
                       
-                      <td>56.49%</td>
+                        <td></td>
                       
-                        <td>0.19%</td>
+                      <td></td>
                       
+                        <td></td>
+                       -->
                       <td></td>
                     </tr>
                     
                     <tr>
                       <td>2</td>
                       
-                        <td><a href="https://deepseekcoder.github.io/">deepseek-coder-33b-instruct</a></td>
+                        <td><a href="https://platform.openai.com/docs/models/gpt-3-5">GPT-3.5 (turbo-0613)</a></td>
                       
                       
-                        <td>33</td>
-                      
-                      <td>50.34%</td>
+                        <td class="dt-center">/</td>
                       
+                      <!-- <td></td> -->
+                      <td class="dt-center">55.35%</td>
+                      <td class="dt-center">52.21%</td>
+                      <td class="dt-center">47.25%</td>
+                      <!-- 
                         <td></td>
                       
-                      <td>50.13%</td>
+                      <td></td>
                       
                         <td></td>
                       
-                      <td>50.55%</td>
+                      <td></td>
                       
                         <td></td>
-                      
+                       -->
                       <td></td>
                     </tr>
                     
                     <tr>
                       <td>3</td>
                       
-                        <td><a href="https://platform.openai.com/docs/models/gpt-3-5">GPT-3.5-turbo (0613)</a></td>
-                      
-                      
-                        <td>/</td>
+                        <td><a href="https://github.com/facebookresearch/codellama">CodeLlama-7B</a></td>
                       
-                      <td>46.70%</td>
                       
-                        <td>0.70%</td>
+                        <td class="dt-center">7</td>
                       
-                      <td>46.16%</td>
+                      <!-- <td></td> -->
+                      <td class="dt-center">47.59%</td>
+                      <td class="dt-center">44.67%</td>
+                      <td class="dt-center">39.59%</td>
+                      <!-- 
+                        <td></td>
                       
-                        <td>0.72%</td>
+                      <td></td>
                       
-                      <td>47.24%</td>
+                        <td></td>
                       
-                        <td>1.75%</td>
+                      <td></td>
                       
+                        <td></td>
+                       -->
                       <td></td>
                     </tr>
                     
                     <tr>
                       <td>4</td>
                       
-                        <td><a href="https://huggingface.co/WizardLM/WizardCoder-Python-34B-V1.0">WizardCoder-Python-34B-V1.0</a></td>
-                      
+                        <td><a href="https://github.com/facebookresearch/codellama">CodeLlama-7B-python</a></td>
                       
-                        <td>34</td>
                       
-                      <td>45.16%</td>
+                        <td class="dt-center">7</td>
                       
+                      <!-- <td></td> -->
+                      <td class="dt-center">47.03%</td>
+                      <td class="dt-center">40.78%</td>
+                      <td class="dt-center">40.86%</td>
+                      <!-- 
                         <td></td>
                       
-                      <td>46.52%</td>
+                      <td></td>
                       
                         <td></td>
                       
-                      <td>43.80%</td>
+                      <td></td>
                       
                         <td></td>
-                      
+                       -->
                       <td></td>
                     </tr>
                     
                     <tr>
                       <td>5</td>
                       
-                        <td><a href="https://github.com/facebookresearch/codellama">CodeLlama-34B-Instruct</a></td>
-                      
+                        <td><a href="https://github.com/facebookresearch/llama">Llama2-7B</a></td>
                       
-                        <td>34</td>
                       
-                      <td>43.71%</td>
+                        <td class="dt-center">7</td>
                       
+                      <!-- <td></td> -->
+                      <td class="dt-center">37.53%</td>
+                      <td class="dt-center">34.01%</td>
+                      <td class="dt-center">32.49%</td>
+                      <!-- 
                         <td></td>
                       
-                      <td>44.03%</td>
+                      <td></td>
                       
                         <td></td>
                       
-                      <td>43.39%</td>
+                      <td></td>
                       
                         <td></td>
-                      
+                       -->
                       <td></td>
                     </tr>
                     
                     <tr>
                       <td>6</td>
                       
-                        <td><a href="https://deepseekcoder.github.io/">deepseek-coder-6.7b-instruct</a></td>
-                      
-                      
-                        <td>6.7</td>
-                      
-                      <td>42.97%</td>
-                      
-                        <td>0.22%</td>
-                      
-                      <td>42.28%</td>
-                      
-                        <td>0.24%</td>
-                      
-                      <td>43.66%</td>
-                      
-                        <td>0.26%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>7</td>
-                      
-                        <td><a href="https://huggingface.co/WizardLM/WizardCoder-Python-13B-V1.0">WizardCoder-Python-13B-V1.0</a></td>
-                      
-                      
-                        <td>13</td>
-                      
-                      <td>41.22%</td>
-                      
-                        <td>0.75%</td>
-                      
-                      <td>41.46%</td>
-                      
-                        <td>1.13%</td>
-                      
-                      <td>40.97%</td>
-                      
-                        <td>0.42%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>8</td>
-                      
-                        <td><a href="https://huggingface.co/WizardLM/WizardCoder-Python-7B-V1.0">WizardCoder-Python-7B-V1.0</a></td>
-                      
-                      
-                        <td>7</td>
-                      
-                      <td>40.30%</td>
-                      
-                        <td>1.15%</td>
-                      
-                      <td>43.70%</td>
-                      
-                        <td>2.09%</td>
-                      
-                      <td>36.90%</td>
-                      
-                        <td>0.22%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>9</td>
-                      
-                        <td><a href="https://github.com/facebookresearch/codellama">CodeLlama-34B</a></td>
-                      
+                        <td><a href="https://github.com/nlpxucan/WizardLM">WizardCoder-Python-7B-V1.0</a></td>
                       
-                        <td>34</td>
                       
-                      <td>39.75%</td>
+                        <td class="dt-center">7</td>
                       
+                      <!-- <td></td> -->
+                      <td class="dt-center">33.59%</td>
+                      <td class="dt-center">30.48%</td>
+                      <td class="dt-center">28.75%</td>
+                      <!-- 
                         <td></td>
                       
-                      <td>39.71%</td>
-                      
-                        <td></td>
-                      
-                      <td>39.79%</td>
-                      
-                        <td></td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>10</td>
-                      
-                        <td><a href="https://huggingface.co/HuggingFaceH4/zephyr-7b-beta">Zypher-7b-beta</a></td>
-                      
-                      
-                        <td>7</td>
-                      
-                      <td>39.59%</td>
-                      
-                        <td>0.68%</td>
-                      
-                      <td>41.20%</td>
-                      
-                        <td>0.25%</td>
-                      
-                      <td>37.97%</td>
-                      
-                        <td>1.29%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>11</td>
-                      
-                        <td><a href="https://huggingface.co/bigcode/octocoder">OctoCoder</a></td>
-                      
-                      
-                        <td>15.5</td>
-                      
-                      <td>37.72%</td>
-                      
-                        <td>0.58%</td>
-                      
-                      <td>35.45%</td>
-                      
-                        <td>0.56%</td>
-                      
-                      <td>40.00%</td>
-                      
-                        <td>1.69%</td>
-                      
                       <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>12</td>
-                      
-                        <td><a href="https://github.com/facebookresearch/codellama">CodeLlama-13B-Instruct</a></td>
-                      
-                      
-                        <td>13</td>
-                      
-                      <td>37.18%</td>
-                      
-                        <td>0.51%</td>
-                      
-                      <td>36.03%</td>
-                      
-                        <td>1.37%</td>
-                      
-                      <td>38.34%</td>
-                      
-                        <td>0.36%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>13</td>
-                      
-                        <td><a href="https://huggingface.co/Qwen/Qwen-14B-Chat">Qwen-14B-Chat</a></td>
-                      
-                      
-                        <td>14</td>
-                      
-                      <td>36.93%</td>
-                      
-                        <td>0.12%</td>
-                      
-                      <td>36.19%</td>
-                      
-                        <td>0.75%</td>
-                      
-                      <td>37.68%</td>
-                      
-                        <td>0.91%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>14</td>
-                      
-                        <td><a href="https://github.com/facebookresearch/codellama">CodeLlama-13B</a></td>
-                      
-                      
-                        <td>13</td>
-                      
-                      <td>33.92%</td>
-                      
-                        <td>0.64%</td>
-                      
-                      <td>31.78%</td>
-                      
-                        <td>1.01%</td>
-                      
-                      <td>36.06%</td>
-                      
-                        <td>0.49%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>15</td>
-                      
-                        <td><a href="https://huggingface.co/WizardLM/WizardCoder-15B-V1.0">WizardCoder-15B-V1.0</a></td>
-                      
-                      
-                        <td>15</td>
-                      
-                      <td>33.34%</td>
-                      
-                        <td>0.74%</td>
-                      
-                      <td>31.85%</td>
-                      
-                        <td>0.60%</td>
-                      
-                      <td>34.83%</td>
-                      
-                        <td>1.86%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>16</td>
-                      
-                        <td><a href="https://github.com/facebookresearch/codellama">CodeLlama-34B-Python</a></td>
-                      
-                      
-                        <td>34</td>
-                      
-                      <td>33.23%</td>
-                      
-                        <td></td>
-                      
-                      <td>32.10%</td>
-                      
-                        <td></td>
-                      
-                      <td>34.35%</td>
                       
                         <td></td>
                       
                       <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>17</td>
-                      
-                        <td><a href="https://huggingface.co/bigcode/octogeex">OctoGeeX</a></td>
-                      
-                      
-                        <td>6</td>
-                      
-                      <td>32.60%</td>
-                      
-                        <td>1.02%</td>
-                      
-                      <td>31.55%</td>
-                      
-                        <td>2.01%</td>
-                      
-                      <td>33.65%</td>
-                      
-                        <td>0.91%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>18</td>
-                      
-                        <td><a href="https://huggingface.co/Qwen/Qwen-7B-Chat">Qwen-7B-Chat</a></td>
-                      
-                      
-                        <td>7</td>
-                      
-                      <td>32.48%</td>
-                      
-                        <td>0.71%</td>
-                      
-                      <td>31.87%</td>
-                      
-                        <td>0.66%</td>
-                      
-                      <td>33.10%</td>
-                      
-                        <td>0.77%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>19</td>
-                      
-                        <td><a href="https://github.com/facebookresearch/codellama">CodeLlama-13B-Python</a></td>
-                      
-                      
-                        <td>13</td>
-                      
-                      <td>32.43%</td>
-                      
-                        <td>0.42%</td>
-                      
-                      <td>31.09%</td>
-                      
-                        <td>1.85%</td>
-                      
-                      <td>33.78%</td>
-                      
-                        <td>1.02%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>20</td>
-                      
-                        <td><a href="https://github.com/facebookresearch/codellama">CodeLlama-7B</a></td>
-                      
-                      
-                        <td>7</td>
-                      
-                      <td>31.07%</td>
-                      
-                        <td>0.87%</td>
-                      
-                      <td>30.05%</td>
-                      
-                        <td>1.79%</td>
-                      
-                      <td>32.09%</td>
-                      
-                        <td>0.05%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>21</td>
-                      
-                        <td><a href="https://huggingface.co/WizardLM/WizardCoder-3B-V1.0">WizardCoder-3B-V1.0</a></td>
-                      
-                      
-                        <td>3</td>
-                      
-                      <td>30.94%</td>
-                      
-                        <td>0.60%</td>
-                      
-                      <td>31.33%</td>
-                      
-                        <td>0.80%</td>
-                      
-                      <td>30.56%</td>
-                      
-                        <td>0.58%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>22</td>
-                      
-                        <td><a href="https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat">Baichuan2-13B-Chat</a></td>
-                      
-                      
-                        <td>13</td>
-                      
-                      <td>30.34%</td>
-                      
-                        <td>0.76%</td>
-                      
-                      <td>30.15%</td>
-                      
-                        <td>0.73%</td>
-                      
-                      <td>30.52%</td>
-                      
-                        <td>1.32%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>23</td>
-                      
-                        <td><a href="https://github.com/facebookresearch/codellama">CodeLlama-7B-Instruct</a></td>
-                      
-                      
-                        <td>7</td>
-                      
-                      <td>29.51%</td>
-                      
-                        <td>0.97%</td>
-                      
-                      <td>27.96%</td>
-                      
-                        <td>0.81%</td>
-                      
-                      <td>31.06%</td>
-                      
-                        <td>1.18%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>24</td>
-                      
-                        <td><a href="https://github.com/facebookresearch/codellama">CodeLlama-7B-Python</a></td>
-                      
-                      
-                        <td>7</td>
-                      
-                      <td>28.88%</td>
-                      
-                        <td>0.45%</td>
-                      
-                      <td>27.51%</td>
-                      
-                        <td>1.09%</td>
-                      
-                      <td>30.24%</td>
-                      
-                        <td>1.97%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>25</td>
-                      
-                        <td><a href="https://huggingface.co/WizardLM/WizardCoder-1B-V1.0">WizardCoder-1B-V1.0</a></td>
-                      
-                      
-                        <td>1</td>
-                      
-                      <td>27.11%</td>
-                      
-                        <td>0.85%</td>
-                      
-                      <td>26.53%</td>
-                      
-                        <td>1.12%</td>
-                      
-                      <td>27.70%</td>
-                      
-                        <td>0.72%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>26</td>
-                      
-                        <td><a href="https://huggingface.co/bigcode/starcoder">StarCoder</a></td>
-                      
-                      
-                        <td>15.5</td>
-                      
-                      <td>26.79%</td>
-                      
-                        <td>0.18%</td>
-                      
-                      <td>28.74%</td>
-                      
-                        <td>0.78%</td>
-                      
-                      <td>24.84%</td>
-                      
-                        <td>0.96%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>27</td>
-                      
-                        <td><a href="https://huggingface.co/bigcode/starcoderplus">StarCoderPlus</a></td>
-                      
-                      
-                        <td>15.5</td>
-                      
-                      <td>26.07%</td>
-                      
-                        <td>1.25%</td>
-                      
-                      <td>24.45%</td>
-                      
-                        <td>1.55%</td>
-                      
-                      <td>27.69%</td>
-                      
-                        <td>2.13%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>28</td>
-                      
-                        <td><a href="https://huggingface.co/Salesforce/codegen25-7b-instruct">CodeGen2.5-7B-instruct</a></td>
-                      
-                      
-                        <td>7</td>
-                      
-                      <td>25.67%</td>
-                      
-                        <td>1.57%</td>
-                      
-                      <td>23.36%</td>
-                      
-                        <td>1.77%</td>
-                      
-                      <td>27.98%</td>
-                      
-                        <td>1.46%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>29</td>
-                      
-                        <td><a href="https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat">Baichuan2-7B-Chat</a></td>
-                      
-                      
-                        <td>7</td>
-                      
-                      <td>24.39%</td>
-                      
-                        <td>0.25%</td>
-                      
-                      <td>25.44%</td>
-                      
-                        <td>0.02%</td>
-                      
-                      <td>23.34%</td>
-                      
-                        <td>0.50%</td>
-                      
-                      <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>30</td>
-                      
-                        <td><a href="https://platform.openai.com/docs/models/gpt-base">davinci-002</a></td>
-                      
-                      
-                        <td>/</td>
-                      
-                      <td>19.08%</td>
-                      
-                        <td>1.00%</td>
-                      
-                      <td>17.47%</td>
-                      
-                        <td>1.56%</td>
-                      
-                      <td>20.70%</td>
-                      
-                        <td>0.70%</td>
                       
+                        <td></td>
+                       -->
                       <td></td>
                     </tr>
                     
                     <tr>
-                      <td>31</td>
-                      
-                        <td><a href="https://huggingface.co/microsoft/phi-1_5">phi-1.5</a></td>
-                      
-                      
-                        <td>1.5</td>
-                      
-                      <td>16.63%</td>
-                      
-                        <td>0.03%</td>
+                      <td>7</td>
                       
-                      <td>14.10%</td>
+                        <td><a href="https://github.com/THUDM/ChatGLM3">ChatGLM3</a></td>
                       
-                        <td>0.84%</td>
                       
-                      <td>19.16%</td>
+                        <td class="dt-center">6</td>
                       
-                        <td>0.87%</td>
+                      <!-- <td></td> -->
+                      <td class="dt-center">18.10%</td>
+                      <td class="dt-center">15.19%</td>
+                      <td class="dt-center">14.84%</td>
+                      <!-- 
+                        <td></td>
                       
                       <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>32</td>
-                      
-                        <td><a href="https://github.com/THUDM/CodeGeeX2">CodeGeeX2</a></td>
-                      
-                      
-                        <td>6</td>
                       
-                      <td>16.50%</td>
-                      
-                        <td>0.39%</td>
-                      
-                      <td>16.20%</td>
-                      
-                        <td>0.65%</td>
-                      
-                      <td>16.80%</td>
-                      
-                        <td>0.33%</td>
+                        <td></td>
                       
                       <td></td>
-                    </tr>
-                    
-                    <tr>
-                      <td>33</td>
-                      
-                        <td><a href="https://huggingface.co/microsoft/phi-1">phi-1</a></td>
-                      
-                      
-                        <td>1.3</td>
-                      
-                      <td>12.84%</td>
-                      
-                        <td>0.73%</td>
-                      
-                      <td>11.45%</td>
-                      
-                        <td>0.19%</td>
-                      
-                      <td>14.23%</td>
-                      
-                        <td>1.28%</td>
                       
+                        <td></td>
+                       -->
                       <td></td>
                     </tr>
                     
@@ -1126,62 +551,64 @@ <h2 class="title is-3">Leaderboard</h2>
       </div>
     </div>
   </section>
-
+  
   <section class="section">
     <div class="container is-max-desktop">
       <!-- Benchmarking Tutorial -->
       <div class="columns is-centered has-text-centered">
         <div class="column is-four-fifths">
-          <h2 class="title is-3">Try the Benchmark!</h2>
+          <h2 class="title is-3">Pipeline Onboarding</h2>
           <div class="content has-text-justified">
-            <h3>Step 0: Setup</h3>
+            <h3>Setup</h3>
             <ol>
-              <li>Convert or save your model weights in Hugging Face Transformers format.</li>
-              <li>Clone the two repositories: <a href="https://github.com/infi-coder/ffqa-evaluation-harness">Inference Repo</a> and <a href="https://github.com/infi-coder/inficoder-eval">Evaluation Repo</a>.</li>
-              <p>No requirement on the local directory paths.</p>
-              <li>Set global environment variable: </li>
-              <div class="highlight"><code>export INFERENCE_REPO_PATH=[evaluation repo]/batched_prompts/suite_v2.0.0_dev.csv</code></div>
+		<li>We have to initialize the environment using the following code:</li>
+              <pre><code>conda create -n agent python==3.9.12
+pip3 install -r requirements.txt</code></pre>
+		    <li>We also provide a Docker-based Python code sandbox in our pipeline. You can build the Docker image with the following command:</li>
+		    <pre><code>docker build -t myimg .</code></pre>
             </ol>
             <br />
-            <h3>Step 1: Generate Response for Your Model</h3>
+            <h3>Example 1: Demo Usage</h3>
             <ol>
-              <li>Set the working directory to <a href="https://github.com/infi-coder/ffqa-evaluation-harness">Inference Repo</a>.</li>
-              <p>The inference repo is forked and slightly modified from <a href="https://github.com/bigcode-project/bigcode-evaluation-harness">bigcode-evaluation-harness</a> framework. We leverage its function for inference.</p> 
-              <li>Determine the prompt format to use, which corresponds to task name.</li>
-              <p>We support these format for now: <code>code-ffqa-v2</code> (the default one, system + '\n' + content), <code>code-ffqa-v2-endn</code> (system + '\n' + content + '\n'), <code>code-ffqa-v2-deepseek-chat</code> (deepseek-coder-instruct format), <code>code-ffqa-v2-baichuan2</code> (baichuan2 models format), <code>code-ffqa-v2-zypher</code> (zypher-7b-beta format), <code>code-ffqa-v2-octo</code> (octopack model format), <code>code-ffqa-v2-wizard</code> (wizard-python model format), <code>code-ffqa-v2-phi</code> (phi-1.5 model format), and <code>code-ffqa-v2-inficoder</code> (our InfiCoder model format).</p>
-              <p>Feel free to contribute by adding your model format, which is easy - just slightly modify <code>bigcode_eval/tasks/code_ffqa_v200.py</code> a bit.</p> 
-              <li>Run batch inference to generate responses for question prompts.</li>
-              <div class="highlight"><code>accelerate launch [inference repo dir]/main.py --model [your model path / hugging face hub path] --tasks [determined task name above] --batch_size [batch_size] --precision bf16 --n_samples 30 --do_sample True --temperature 0.2 --top_p 0.9 --save_generations --save_references --trust_remote_code --generation_only --max_new_tokens 1024 --save_generations_path [output raw response file path].json --eos='[EOS string]'</code></div>
-              <p>This command will output two files in your working directory: <code>[output raw response file path].json</code> which stores responses and <code>references.json</code> which stores case names as the index.</p>
-              <li>Export responses and case names to evaluation-capatible csv file.</li>
-              <div class="highlight"><code>python3 [inference repo dir]/ffqa_processor.py [output raw response file path].json references.json [response csv file].csv</code></div>
-              <p>This command will join the two output files above into one csv file <code>[response csv file].csv</code> which can be processed by the evaluation framework below.</p>
+              <li>You can easily use the following command to start a demo using APIs.</li>
+		    <pre><code># initialize poetry
+poetry init
+# Supported LLM: OPEN_AI, AZURE_OPEN_AI
+# api_key is required for API-based models
+bash run_demo.sh --llm AZURE_OPEN_AI --api_key 123</code></pre>
+              <li>After running the above code, an interactive frontend interface will be displayed.</li> 
+		    <img src="static/images/demo_fig.png" />
+              <li>You can enter prompts in the dialogue box, and if you need to upload a file, you can select the file for upload in the "browse files" section.</li>
+              <li>Click the "run code interpreter" button, and the backend will execute our pipeline. The agent will generate code and execute it in the sandbox, and the results will be returned on the interactive page upon completion.</li>
             </ol>
             <br />
-            <h3>Step 2: Evaluation (Dev Set)</h3>
+            <h3>Example 2: Running With Local Models</h3>
             <ol>
-              <li>Setup the evaluation framework: <a href="https://github.com/infi-coder/inficoder-eval">Evaluation Repo</a>.</li>
+              <li>Our local LLM service is developed on vLLM. First, you can start a vLLM model serving by running this command:</li>
+		    <pre><code># Take llama-2-7b as an example
+python3 src/activities/vllm_api_server.py --model "meta-llama/Llama-2-7b-hf"  --served_model_name "meta-llama/Llama-2-7b-hf"</code></pre>
               <p>At this point, we only support Linux environment.</p>
-              <p>Run <code>pip3 install -r requirements.txt</code>, then <code>./setup.sh</code> (time costly, usually 1-2 hours) which installs necessary compilers and packages for multi-lingual execution environment.</p>
-              <li>Check the evaluation environment.</li>
-              <p>Run <code>python3 env_check.py</code> to check and fix the environment incompatibility according to the console output. If the console output is "You're good to go.", then we can proceed.</p>
-              <li>Unpack the csv output.</li>
-              <p>Unpack the csv output file from the previous inference step into a directory where each response is stored in a separate txt.</p>
-              <div class="highlight"><code>python3 adaptors/csv_response_unpacker.py [response csv file].csv [response save dir]</code></div>
-              <p>We recommend to save the responses in a directory in <code>responses/</code>, i.e., let <code>[response save dir]=responses/...</code>. The above script will create the <code>[response save dir]</code> directory if it does not exist.</p>
-              <li>Run main evaluation.</li>
-              <div class="highlight"><code>python3 grader_main.py suite_v2.0.0_dev.yaml [response save dir]</code></div>
-              <p>The evaluation takes around 15 min - 45 min.</p>
-              <p>When it finishes, there are two output files: <code>results/suite_v2.0.0_dev_[response save dir base name].txt</code> (short summary) and <code>results/suite_v2.0.0_dev_[response save dir base name].yaml</code> (all details).</p>
-              <p>You can also customized the output paths by <code>--result_summary_path</code> and <code>--result_detail_path</code> arguments respectively.</p>
-              <li>Get statistics and print the results.</li>
-              <div class="highlight"><code>python3 print_result_stat.py [result detail path] [summary txt path]</code></div>
-              <p>In console output and <code>[summary txt path]</code>, a nice table will be printed, including the overall score and percentage and the sub-scores for each question type, metric type, and programming language.</p>
+              <li>To verify that the server has started successfully, you can try this command:</li>
+	      <pre><code>curl http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "meta-llama/Llama-2-7b-hf",
+        "prompt": "San Francisco is a",
+        "max_tokens": 7,
+        "temperature": 0
+    }'</code></pre>
+              <li>Then you can run the demo following this command:</li>
+	      <pre><code>bash run_demo.sh --llm "meta-llama/Llama-2-7b-hf"</code></pre>
             </ol>
-            <h3>Step 3: Evaluate (Test Set)</h3>
-            <p>
-              Available upon request (<a href="mailto:linyi.li@bytedance.com">email us</a>).
-            </p>
+            <br />
+            <h3>Example 3: Running Without Front-end</h3>
+	<ol>
+		<p>Our demo is designed to default to the use of the front-end. If you prefer not to use the front-end and instead perform command-line operations or process large amounts of data, you can refer to the following commands:</p>
+		<pre><code># Run with API.
+python3 ./src/activities/eval.py --llm AZURE_OPEN_AI --api_key 123
+# Run with local model. Take llama-2-7b as an example.
+python3 ./src/activities/eval.py --llm "meta-llama/Llama-2-7b-hf"</code></pre>
+	</ol>
           </div>
         </div>
       </div>
@@ -1203,11 +630,11 @@ <h2 class="title">Acknowledgement</h2>
       <div class="bibtex-body">
         <h2 class="title">BibTeX</h2>
         <pre><code>@misc{li2023inficodereval,
-  author = {InfiCoderTeam},
-  title = {InfiCoder-Eval: Systematically Evaluating Question-Answering for Code Large Language Models},
+  author = {InfiAgent Team},
+  title = {InfiAgent: Building and Evaluating Agents on Data Analysis},
   year = {2023},
   publisher = {Github Pages},
-  howpublished = "\url{https://github.com/infi-coder/inficoder-eval}"
+  howpublished = "\url{https://infiagent.github.io/}"
 }</code></pre>
       </div>
     </div>
@@ -1217,12 +644,12 @@ <h2 class="title">BibTeX</h2>
   <footer class="footer">
     <div class="container">
       <div class="content has-text-centered">
-        <!-- <a class="icon-link" href="https://arxiv.org/pdf/2211.11501">
+        <!-- <a class="icon-link" href="./static/report/inficoder_eval_report_draft.pdf">
           <i class="fas fa-file-pdf" style="color:white"></i>
-        </a> -->
-        <a class="external-link" href="https://github.com/infi-coder" disabled="">
-          <i class="fab fa-github" style="color:white"></i>
         </a>
+        <a class="icon-link external-link" href="https://github.com/infi-coder" disabled>
+          <i class="fab fa-github" style="color:white"></i>
+        </a> -->
       </div>
       <div class="columns is-centered">
         <div class="column is-8">
@@ -1232,7 +659,7 @@ <h2 class="title">BibTeX</h2>
                 Commons Attribution-ShareAlike 4.0 International License</a>.
             </p>
             <p>
-              This means you are free to borrow the <a href="https://github.com/infi-coder/inficoder-eval.github.io">source
+              This means you are free to borrow the <a href="https://github.com/infi-coder/inficoder-eval">source
                 code</a> of this website,
               we just ask that you link back to this page in the footer.
             </p>
@@ -1244,7 +671,7 @@ <h2 class="title">BibTeX</h2>
   
   <script>
     $(document).ready( function () {
-      $('.mainTable').DataTable({ordering: true, order: [[3, 'desc']], columns: [{ "type": "num" },{ "type": "html" },{ "type": "num" },{ "type": "num-fmt" },{ "type": "num-fmt" },{ "type": "num-fmt" },{ "type": "num-fmt" },{ "type": "num-fmt" },{ "type": "num-fmt" },{ "type": "html", "orderable": false }]});
+      $('.mainTable').DataTable({ordering: true, order: [[3, 'desc']], columns: [{ "type": "num" },{ "type": "html" },{ "type": "num" },{ "type": "num-fmt" },{ "type": "num-fmt" },{ "type": "num-fmt" }]});
     } );
   </script>
 
diff --git a/_site/static/images/InfiAgent.png b/_site/static/images/InfiAgent.png
new file mode 100644
index 0000000..7cbab5f
Binary files /dev/null and b/_site/static/images/InfiAgent.png differ
diff --git a/_site/static/images/all_results.png b/_site/static/images/all_results.png
index d000c7c..ae53039 100644
Binary files a/_site/static/images/all_results.png and b/_site/static/images/all_results.png differ
diff --git a/_site/static/images/case-study-eval-data.png b/_site/static/images/case-study-eval-data.png
new file mode 100644
index 0000000..0037ed5
Binary files /dev/null and b/_site/static/images/case-study-eval-data.png differ
diff --git a/_site/static/images/demo_fig.png b/_site/static/images/demo_fig.png
new file mode 100644
index 0000000..0d01d03
Binary files /dev/null and b/_site/static/images/demo_fig.png differ
diff --git a/_site/static/images/inficoder-eval-main.png b/_site/static/images/inficoder-eval-main.png
index f4d02eb..b56ae46 100644
Binary files a/_site/static/images/inficoder-eval-main.png and b/_site/static/images/inficoder-eval-main.png differ
diff --git a/_site/static/images/inficoder_eval_logo.png b/_site/static/images/inficoder_eval_logo.png
deleted file mode 100644
index 509db99..0000000
Binary files a/_site/static/images/inficoder_eval_logo.png and /dev/null differ
diff --git a/_site/static/js/index.js b/_site/static/js/index.js
index 21ba87f..0ca2235 100644
--- a/_site/static/js/index.js
+++ b/_site/static/js/index.js
@@ -58,20 +58,20 @@ $(document).ready(function() {
     	});
     }
 
-    // /*var player = document.getElementById('interpolation-video');
-    // player.addEventListener('loadedmetadata', function() {
-    //   $('#interpolation-slider').on('input', function(event) {
-    //     console.log(this.value, player.duration);
-    //     player.currentTime = player.duration / 100 * this.value;
-    //   })
-    // }, false);*/
-    // preloadInterpolationImages();
+    /*var player = document.getElementById('interpolation-video');
+    player.addEventListener('loadedmetadata', function() {
+      $('#interpolation-slider').on('input', function(event) {
+        console.log(this.value, player.duration);
+        player.currentTime = player.duration / 100 * this.value;
+      })
+    }, false);*/
+    preloadInterpolationImages();
 
-    // $('#interpolation-slider').on('input', function(event) {
-    //   setInterpolationImage(this.value);
-    // });
-    // setInterpolationImage(0);
-    // $('#interpolation-slider').prop('max', NUM_INTERP_FRAMES - 1);
+    $('#interpolation-slider').on('input', function(event) {
+      setInterpolationImage(this.value);
+    });
+    setInterpolationImage(0);
+    $('#interpolation-slider').prop('max', NUM_INTERP_FRAMES - 1);
 
     bulmaSlider.attach();
 
diff --git a/_site/static/report/inficoder_eval_report_draft.pdf b/_site/static/report/inficoder_eval_report_draft.pdf
index 0fb600c..2344bee 100644
Binary files a/_site/static/report/inficoder_eval_report_draft.pdf and b/_site/static/report/inficoder_eval_report_draft.pdf differ
diff --git a/github_token.txt b/github_token.txt
new file mode 100644
index 0000000..310fced
--- /dev/null
+++ b/github_token.txt
@@ -0,0 +1 @@
+<REDACTED: a GitHub personal access token was committed here in plain text. Revoke the token and keep credentials out of version control.>
\ No newline at end of file
diff --git a/index.markdown b/index.markdown
index 3e157d1..4773528 100644
--- a/index.markdown
+++ b/index.markdown
@@ -1,17 +1,15 @@
 ---
-layout: default
+layout: mydefault
 ---
 
 <html>
 
 <head>
   <meta charset="utf-8">
-  <meta name="description" content="InfiCoder-Eval: Systematically Evaluating Question-Answering
-  for Code Large Language Models">
-  <meta name="keywords" content="DS-1000, data-science, code-generation, codex, semantic-parsing">
+  <meta name="description" content="InfiAgent-DABench: Evaluating Agents on Data Analysis Tasks">
+  <meta name="keywords" content="InfiAgent-DS, code-generation, large-language-model, benchmark">
   <meta name="viewport" content="width=device-width, initial-scale=1">
-  <title>InfiCoder-Eval: Systematically Evaluating Question-Answering
-    for Code Large Language Models</title>
+  <title>InfiAgent-DABench: Evaluating Agents on Data Analysis Tasks</title>
 
   <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
 
@@ -36,12 +34,26 @@ layout: default
   <link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.11.3/css/dataTables.bootstrap5.min.css">
   <script src="https://cdn.datatables.net/1.11.3/js/dataTables.bootstrap5.min.js"></script>
 
-  <link rel="icon" href="./static/images/inficoder_eval_logo2.png">
+  
 
   <script defer src="./static/js/fontawesome.all.min.js"></script>
   <script src="./static/js/bulma-carousel.min.js"></script>
   <script src="./static/js/bulma-slider.min.js"></script>
   <script src="./static/js/index.js"></script>
+
+  <script type="text/javascript" async
+  src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-AMS_HTML">
+  </script>
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    tex2jax: {
+      inlineMath: [['$', '$'], ['\\(', '\\)']],
+      displayMath: [['$$', '$$'], ['\\[', '\\]']],
+      processEscapes: true
+    }
+  });
+  </script>
+
 </head>
 
 <body>
@@ -56,7 +68,7 @@ layout: default
     </div>
     <div class="navbar-menu">
       <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
-        <a class="navbar-item" href="https://github.com/infi-coder">
+        <a class="navbar-item" href="https://github.com/InfiAgent/InfiAgent/tree/main">
           <span class="icon">
             <i class="fas fa-home"></i>
           </span>
@@ -67,8 +79,8 @@ layout: default
             More
           </a>
           <div class="navbar-dropdown">
-            <a class="navbar-item" href="https://github.com/infi-coder">
-              InfiCoder Organization
+            <a class="navbar-item" href="https://github.com/InfiAgent/InfiAgent/tree/main/examples/DA-Agent">
+              InfiAgent-DABench
             </a>
           </div>
         </div>
@@ -79,7 +91,13 @@ layout: default
 
   <div class="container">
     <div class="column has-text-centered">
-      <img style="max-width: 200px; margin-bottom: -50px;" src="static/images/inficoder_eval_logo2.png">
+      
+    </div>
+  </div>
+
+  <div class="container">
+    <div class="column has-text-centered">
+      <img style="max-width: 200px; margin-bottom: -50px;" src="static/images/infiagent_logo.png">
     </div>
   </div>
 
@@ -88,22 +106,21 @@ layout: default
       <div class="container is-max-desktop">
         <div class="columns is-centered">
           <div class="column has-text-centered">
-            <h1 class="title is-1 publication-title">InfiCoder-Eval: Systematically Evaluating Question-Answering
-              for Code Large Language Models
+            <h1 class="title is-1 publication-title">InfiAgent-DABench: Evaluating Agents on Data Analysis Tasks
             </h1>
             <div class="is-size-5 publication-authors">
               <span class="author-block">
-                InfiCoder Team @ ByteDance Ltd. and Peking University
+                InfiAgent Team 
               </span>
               <br>
-              <span class="author-block">
-                <!-- Main Maintainer: <a href="mailto:linyi.li@bytedance.com">Linyi Li</a> -->
+              <!-- <span class="author-block">
+                Main Maintainer: <a href="mailto:linyi.li@bytedance.com">Linyi Li</a>
                 <br>
                 Team Lead: <a href="mailto:hx.yang@bytedance.com">Hongxia Yang</a>
-              </span>
+              </span> -->
               <!-- <span class="author-block">
                 <a href="https://xxx.github.io/">Siwei Wang</a><sup>1</sup></span>
-			        <br/>
+			        <br/> -->
             </div>
 
             <div class="is-size-5 publication-authors">
@@ -113,26 +130,27 @@ layout: default
 
             <div class="column has-text-centered">
               <div class="publication-links">
-                <!-- PDF Link. -->
-                <span class="link-block">
-                  <a href="./static/report/inficoder_eval_report_draft.pdf"
-                    class="external-link button is-normal is-rounded is-dark" target='_blank'>
-                    <span class="icon">
-                      <i class="ai ai-arxiv"></i>
-                    </span>
-                    <span>Report (draft version)</span>
-                  </a>
-                </span>
+	      <!-- PDF Link. -->
+	      <span class="link-block">
+	        <a href="https://arxiv.org/abs/2401.05507"
+		  class="external-link button is-normal is-rounded is-dark" target='_blank'>
+		  <span class="icon">
+		    <i class="ai ai-arxiv"></i>
+		  </span>
+		  <span>Paper</span>
+	        </a>
+	      </span>
+                
                 <!-- Dataset Link. -->
                 <span class="link-block">
-                  <a href="https://github.com/infi-coder/ffqa-evaluation-harness"
+                  <a href="https://github.com/InfiAgent/InfiAgent/tree/main"
                      class="external-link button is-normal is-rounded is-dark" target='_blank'>
                     <span class="icon">
                       <i class="fab fa-github"></i>
                     </span>
-                    <span>Inference Repo</span>
+                    <span>Project Repo</span>
                   </a>
-                  <a href="https://github.com/infi-coder/inficoder-eval"
+                  <a href="https://github.com/InfiAgent/InfiAgent/tree/main/examples/DA-Agent"
                      class="external-link button is-normal is-rounded is-dark" target='_blank'>
                     <span class="icon">
                       <i class="fab fa-github"></i>
@@ -163,160 +181,30 @@ layout: default
     <div class="container is-max-desktop">
       <div class="hero-body">
         <h2 class="subtitle has-text-centered">
-          InfiCoder-Eval is a systematic benchmark and evaluation framework for the free-form question-answering ability of code language models.
+          InfiAgent-DABench is a project to build and evaluate agents for advanced data analysis. Agent evaluation has been an open and challenging problem.
         </h2>
-        <img src="static/images/inficoder-eval-main.png">
-      </div>
-    </div>
-  </section>
-
-
-  <section class="section">
-    <div class="container is-max-desktop">
-      <!-- Abstract. -->
-      <div class="columns is-centered has-text-centered">
-        <div class="column is-four-fifths">
-          <h2 class="title is-3">Overview</h2>
-          <div class="content has-text-justified">
-            <p>
-              Large language models for code (code LLMs)
-              have made huge progress. Evaluation benchmarks
-              for code LLMs, such as <a href="https://github.com/openai/human-eval">HumanEval</a>, <a href="https://ds1000-code-gen.github.io/">DS-1000</a>,
-              and <a href="https://arxiv.org/abs/2108.07732">MBPP</a>, predominantly focus on code generation. But they are insufficient to evaluate code
-              LLMs’ multifaceted ability. To fill this gap, we
-              propose InfiCoder-Eval, a large-scale free-form
-              question-answering (QA) benchmark for code.
-              InfiCoder-Eval comprises 270 carefully picked
-              high-quality StackOverflow questions, covering
-              18 programming languages. To tackle the evaluation challenge, InfiCoder-Eval includes an evaluation framework integrating four types of model-free metrics, and domain experts design the concrete criteria for each question. As confirmed
-              with human experiments, InfiCoder-Eval evaluation aligns with humans better than model-based evaluation and runs much faster at the
-              same time. We conduct a systematic evaluation with InfiCoder-Eval for more than 30 code
-              LLMs, leading to several interesting findings. For
-              example, though open-source code LLMs show
-              competitive performance with proprietary models in code generation (e.g., HumanEval), they
-              still have a large gap compared to proprietary
-              ones in InfiCoder-Eval and even the best proprietary LLM (GPT4) is still far from perfect (best
-              open-source model Deepseek-Coder 33B Instruct
-              achieves 50.34% and GPT4 achieves 58.89%).
-              Furthermore, our detailed analysis reveals several
-              weaknesses of current code LLMs. Benchmark,
-              evaluation tools, and detailed results are all publicly available.
-            </p>
-          </div>
-        </div>
+        <img src="static/images/framework.png">
       </div>
-      <!--/ Abstract. -->
     </div>
   </section>
 
 
-  <section class="section">
-    <div class="container is-max-desktop">
-      <!-- Example. -->
-      <div class="columns is-centered has-text-centered">
-        <div class="column is-four-fifths">
-          <h2 class="title is-3">Statistics and Examples</h2>
-          <div class="content has-text-justified">
-            <p>
-              InfiCoder-Eval comprises 270 carefully picked high-quality Stack Overflow questions, covering 18 programming languages.
-            </p>
-
-            <img src="static/images/data_domain_stats.png">
-
-            <p>
-              We recruited five domain experts to create the benchmark and annotate the correctness evaluation criteria.
-              Specifically, the InfiCoder-Eval framework integrates four types of model-free metrics for evaluating the correctness: keywords matching, blank filling, unit testing, and dialogue similarity.
-            </p>
-
-            <img src="static/images/data_examples.png">
-
-            <p>
-              Below is the question type, metric type, and length statistics.
-            </p>
-
-            <center>
-              <img src="static/images/general_statistics.png">
-            </center>
-
-          </div>
-        </div>
-      </div>
-      <!--/ Example. -->
-    </div>
-  </section>
-
-  <section class="section">
-    <div class="container is-max-desktop">
-      <!-- Comparison. -->
-      <div class="columns is-centered has-text-centered">
-        <div class="column is-four-fifths">
-          <h2 class="title is-3">Comparison</h2>
-          <div class="content has-text-justified">
-            <p>Existing benchmarks weigh heavily on code generation, unit-test-based evaluation, and a limited set of programming languages. InfiCoder-Eval processes a much higher diversity to reflect real-world code LLMs’ usage scenarios and is far from saturation.</p>
-            <img src="static/images/comparison.png">
-          </div>
-        </div>
-      </div>
-      <!--/ Comparison. -->
-    </div>
-  </section>
-
-
-  <section class="section">
-    <div class="container is-max-desktop">
-      <!-- Perturbation and Prompt. -->
-      <div class="columns is-centered has-text-centered">
-        <div class="column is-four-fifths">
-          <h2 class="title is-3">Prompts and Evaluation Protocol</h2>
-          <div class="content has-text-justified">
-            Each question contains a system prompt and content prompt.
-            For questions whose responses are mainly in natural language, the system prompt is
-            <div class="highlighter-rouge">
-              <div class="highlight">
-                <code>You are a professional assistant for programmers. By default, questions and answers are in Markdown format. You are chatting with programmers, so please answer as briefly as possible.
-                </code>
-              </div>
-            </div>
-            For other questions, the system prompt is
-            <div class="highlighter-rouge">
-              <div class="highlight">
-                <code>You are a professional assistant for programmers. By default, questions and answers are in Markdown format.
-                </code>
-              </div>
-            </div>
-            We then format the system prompt and content prompt following each model's default instruction template.
-            If no instruction template specified, we use the prompt format 
-            <div class="highlight">
-              <code>{system prompt}\n{content prompt}
-              </code>
-            </div>
-            <p>We adopt <b>best@10</b> as the main evaluation metric, where 10 responses are sampled and evaluated for each question and the best score per question is recorded and summed up.
-              Throughout the evaluation, we set <b>sampling temperature T to be 0.2 and top p cut-off threshold to be 0.9</b>.
-              We leave the exploration of other hyperparameters as the future work.
-            </p>
-            <p>For score computation, we treat each question equally with one point each.
-              Given 270 questions in the benchmark, the full score is 270, and we by default report the percentage score (achieved score divided by the full score which is 270).
-              The one point for each question can be further decomposed into a few scoring points within each question.
-              For example, a question may contain four keywords with weights 2, 1, 1, and 1 each.
-              Then, matching each keyword can contribute to 0.4, 0.2, 0.2, and 0.2 points respectively to the final score.
-            </p>
-          </div>
-        </div>
-      </div>
-      <!--/ Perturbation and Prompt. -->
-    </div>
-  </section>
 
   <section class="section">
     <div class="container is-max-desktop">
         <div class="columns is-centered has-text-centered">
         <div class="column is-four-fifths">
           <h2 class="title is-3">Leaderboard</h2>
+
+          <div class="content has-text-justified">
+          <p>In this section, we provide a comprehensive evaluation of closed-source LLMs such as GPT-4 and GPT-3.5, as well as widely used open-source LLMs. In addition, we test our DAAgent, an instruction-tuned agent for data analysis.</p>
+           <img src="static/images/main_results.png">
+           </div>
         </div>
       </div>
     </div>
-    <br>
-    <div class="container is-max-desktop has-text-justified">
+    <!-- <br> -->
+    <!-- <div class="container is-max-desktop has-text-justified">
       <div class="columns is-centered has-text-centered">
         <div class="column is-four-fifths">
           <div>
@@ -325,7 +213,7 @@ layout: default
           <p>Each blue point corresponds to one open-source model, with error bars for those smaller than 30B parameters. Proprietary models are plotted as lines with uncertainty ranges.</p>
         </div>
       </div>
-    </div>
+    </div> -->
     <div class="cover" id="contentCover">
       <!-- Baseline. -->
       <div class="container-t">
@@ -334,10 +222,14 @@ layout: default
             <div class="infoCard">
               <div class="infoBody">
                 <p align="left">
-                  <div class="left"><b>Notice</b>: we set the max tokens to generate=1024 (since GPT4 generates 662 tokens without the constraint, we provide some wiggle room by setting to 1024 tokens)
+                
+               
+                  <div class="left"><b>Notice</b>: We set temperature=0.2, top_p=1.0 and frequency_penalty=0.0 for all the models.  
+                  <!-- <p style="color: red;"> Regrettably, the open-source LLMs enumerated below are currently incapable of adhering to agent directives or delivering substantively meaningful responses. In response to this limitation, we have developed a Supervised Fine-Tuning (SFT) dataset aimed at refining these models. For these open-source LLMs, the table below delineates the enhancements observed post-finetuning. Please anticipate the imminent release of both the SFT dataset and its comprehensive details. </p> -->
                   </div>
+                 
                 </p>
-                <p align="left">
+                <!-- <p align="left">
                   <div class="left">We evenly split the 270 benchmark questions to 135-question dev set and 135-question test set. Dev set is publicly available, and the test set is on held where evaluation is available upon request (see below for instructions). 
                   Models are ranked according to full set scores.
                   </div>
@@ -345,7 +237,8 @@ layout: default
                 <p align="left">
                   <div class="left">For models with >30B parameters, we evaluate once due to resource limit, otherwise we evaluate three times and report the mean and standard deviation.
                   </div>
-                </p>
+                </p> -->
+	    <!--
                 <br>
                 <table class="table maintable stripe hover row-border order-column" id="maintable">
                   <thead>
@@ -353,13 +246,13 @@ layout: default
                       <th>Rank</th>
                       <th>Model Name</th>
                       <th># Params. (in B)</th>
-                      <th>Full Set Score</th>
-                      <th>Full Set Std</th>
-                      <th>Dev Set Score</th>
-                      <th>Dev Set Std</th>
+                      <th>Proportional Accuracy by Subquestions</th>
+                      <th>Accuracy by Questions</th>
+                      <th>Uniform Accuracy by Subquestions</th>
+                      <!-- <th>Dev Set Std</th>
                       <th>Test Set Score</th>
                       <th>Test Set Std</th>
-                      <th></th>
+                      <th></th> -->
                     </tr>
                   </thead>
                   <tbody>
@@ -372,12 +265,15 @@ layout: default
                         <td>{{ item.title }}</td>
                       {% endif %}
                       {% if item.size != null %}
-                        <td>{{ item.size }}</td>
+                        <td class="dt-center">{{ item.size }}</td>
                       {% else %}
-                        <td>/</td>
+                        <td class="dt-center">/</td>
                       {% endif %}
-                      <td>{{ item.score }}</td>
-                      {% if item.score_std != null %}
+                      <!-- <td>{{ item.score }}</td> -->
+                      <td class="dt-center">{{ item.aps_score }}</td>
+                      <td class="dt-center">{{ item.as_score }}</td>
+                      <td class="dt-center">{{ item.a_score }}</td>
+                      <!-- {% if item.score_std != null %}
                         <td>{{ item.score_std }}</td>
                       {% else %}
                         <td></td>
@@ -393,7 +289,7 @@ layout: default
                         <td>{{ item.testscore_std }}</td>
                       {% else %}
                         <td></td>
-                      {% endif %}
+                      {% endif %} -->
                       <td>{{ item.comment }}</td>
                     </tr>
                     {% endfor %}
@@ -406,63 +302,188 @@ layout: default
         </div>
       </div>
     </div>
+  </section> 
+-->
+
+
+  <section class="section">
+    <div class="container is-max-desktop">
+      <!-- Abstract. -->
+      <div class="columns is-centered has-text-centered">
+        <div class="column is-four-fifths">
+          <h2 class="title is-3">Overview</h2>
+          <div class="content has-text-justified">
+            <p>
+              The advent of Large Language Models (LLMs) has spurred the development of LLM-augmented Autonomous Agents (LAAs). These agents are capable of generating and executing code through ongoing interactions between their core LLM and the code execution environment. In this project, we introduce InfiAgent-DABench, the first benchmark specifically designed to evaluate LLM-based agents in data analysis tasks. This benchmark contains DAEval, a dataset consisting of data analysis questions derived from CSV files, and an agent framework to evaluate LLMs as data analysis agents.
+              
+              <!-- we introduce Infinite Agent (InfiAgent), a LAA focused data analysis and code writing. We have developed an automatic evaluation benchmark (InfiAgent-Eval), which covers various data analysis topics such as summary statistics, correlation analysis, and data transformation, providing a comprehensive means for quantitatively assessing LAAs' performance.  -->
+              This page describes the InfiAgent-DABench framework in detail, including dataset construction, evaluation metrics, analytical assessment, and the procedure for pipeline onboarding.
+            </p>
+          </div>
+        </div>
+      </div>
+      <!--/ Abstract. -->
+    </div>
   </section>
 
+
+  <section class="section">
+    <div class="container is-max-desktop">
+      <!-- Example. -->
+      <div class="columns is-centered has-text-centered">
+        <div class="column is-four-fifths">
+          <h2 class="title is-3">Dataset Construction</h2>
+          <div class="content has-text-justified">
+
+
+          We build data analysis queries and responses from existing CSV files. Here is the construction pipeline:
+          <img src="static/images/dataset_construction_eval.png"> 
+            
+            We split the dataset into a validation set and a test set. The validation set contains 311 questions with 55 CSV files. We release only the validation set publicly to avoid data leakage. Here are some examples:
+	  <img src="static/images/question_examples.png"> 
+     	    We categorize CSV files within the dataset into nine distinct categories, determined by their respective domains:
+
+            <ul>
+              <li>Finance and Economics </li>
+              <li>Health and Medical</li>
+              <li>Demographics and Social Science</li>
+              <li>Marketing and Consumer Behavior</li>
+              <li>Energy and Environmental Monitoring</li>
+              <li>Transportation, Logistics, and Tourism</li>
+              <li>Culture, Entertainment, and Media</li>
+              <li>Scientific Research and Technology</li>
+              <li>Other Categories</li>
+            </ul>
+
+            <p>Below is the pie chart depicting the categorical distribution:</p>
+
+             <img src="static/images/domains.png">
+
+            <p>We conduct statistical analyses on the individual concepts associated with each question, accounting for scenarios where a question encompasses multiple concepts:</p>
+
+            <img src="static/images/concepts.png">
+
+          </div>
+        </div>
+      </div>
+      <!--/ Example. -->
+    </div>
+  </section>
+
+  
+  <section class="section">
+    <div class="container is-max-desktop">
+      <!-- Comparison. -->
+      <div class="columns is-centered has-text-centered">
+        <div class="column is-four-fifths">
+          <h2 class="title is-3">Evaluation</h2>
+          <div class="content has-text-justified">
+
+            
+            <p>For closed-form questions, we prompt LLMs with the question description. Because most models struggle to follow the format requirements, we add a reformatting step for all models, using gpt-3.5-turbo-16k to reformat the responses according to the format requirements. The figure below illustrates this process:</p>
+		        <img src="static/images/case-study-eval-data.png">
+          </div>
+        </div>
+      </div>
+      <!--/ Comparison. -->
+    </div>
+  </section>
+
+  <section class="section">
+    <div class="container is-max-desktop">
+      <!-- Comparison. -->
+      <div class="columns is-centered has-text-centered">
+        <div class="column is-four-fifths">
+          <h2 class="title is-3">Metrics</h2>
+          <div class="content has-text-justified">
+
+            <!-- <img src="static/images/leaderboard.jpeg"> -->
+            
+            For closed-form questions, we have defined the following metrics:
+
+Proportional Accuracy by Subquestions (PASQ):
+$$
+\text{PASQ} = \frac{1}{N} \sum_{i=1}^{N} \left( \frac{1}{M_i} \sum_{j=1}^{M_i} I_{ij} \right)
+$$
+Here, $N$ is the total number of questions, $M_i$ is the number of subquestions for the $i$-th question, and $I_{ij}$ is an indicator that equals 1 if the $j$-th subquestion of the $i$-th question is answered correctly, and 0 otherwise.
+Accuracy by Questions (ABQ):
+$$
+\text{ABQ} = \frac{1}{N} \sum_{i=1}^{N} \left( \prod_{j=1}^{M_i} I_{ij} \right)
+$$
+In this expression, the product
+$\prod_{j=1}^{M_i} I_{ij}$ equals 1 if all subquestions of the $i$-th question are answered correctly, and 0 otherwise.
+Uniform Accuracy by Subquestions (UASQ):
+$$
+\text{UASQ} = \frac{1}{\sum_{i=1}^{N} M_i} \sum_{i=1}^{N} \sum_{j=1}^{M_i} I_{ij}
+$$
+Here, the total accuracy is the sum of the values of the indicator function across all subquestions, normalized by the total number of subquestions in the dataset.
+          </div>
+        </div>
+      </div>
+      <!--/ Comparison. -->
+    </div>
+  </section>
+
+  
+
+  
   <section class="section">
     <div class="container is-max-desktop">
       <!-- Benchmarking Tutorial -->
       <div class="columns is-centered has-text-centered">
         <div class="column is-four-fifths">
-          <h2 class="title is-3">Try the Benchmark!</h2>
+          <h2 class="title is-3">Pipeline Onboarding</h2>
           <div class="content has-text-justified">
-            <h3>Step 0: Setup</h3>
+            <h3>Setup</h3>
             <ol>
-              <li>Convert or save your model weights in Hugging Face Transformers format.</li>
-              <li>Clone the two repositories: <a href="https://github.com/infi-coder/ffqa-evaluation-harness">Inference Repo</a> and <a href="https://github.com/infi-coder/inficoder-eval">Evaluation Repo</a>.</li>
-              <p>No requirement on the local directory paths.</p>
-              <li>Set global environment variable: </li>
-              <div class="highlight"><code>export INFERENCE_REPO_PATH=[evaluation repo]/batched_prompts/suite_v2.0.0_dev.csv</code></div>
+		<li>Please initialize the environment using the following code:</li>
+              <pre><code>conda create -n agent python==3.9.12
+pip3 install -r requirements.txt</code></pre>
+		    <li>Our pipeline also includes a Docker-based Python code sandbox. Use the following command to build the Docker image:</li>
+		    <pre><code>docker build -t myimg .</code></pre>
             </ol>
             <br>
-            <h3>Step 1: Generate Response for Your Model</h3>
+            <h3>Example 1: Demo Usage</h3>
             <ol>
-              <li>Set the working directory to <a href="https://github.com/infi-coder/ffqa-evaluation-harness">Inference Repo</a>.</li>
-              <p>The inference repo is forked and slightly modified from <a href="https://github.com/bigcode-project/bigcode-evaluation-harness">bigcode-evaluation-harness</a> framework. We leverage its function for inference.</p> 
-              <li>Determine the prompt format to use, which corresponds to task name.</li>
-              <p>We support these format for now: <code>code-ffqa-v2</code> (the default one, system + '\n' + content), <code>code-ffqa-v2-endn</code> (system + '\n' + content + '\n'), <code>code-ffqa-v2-deepseek-chat</code> (deepseek-coder-instruct format), <code>code-ffqa-v2-baichuan2</code> (baichuan2 models format), <code>code-ffqa-v2-zypher</code> (zypher-7b-beta format), <code>code-ffqa-v2-octo</code> (octopack model format), <code>code-ffqa-v2-wizard</code> (wizard-python model format), <code>code-ffqa-v2-phi</code> (phi-1.5 model format), and <code>code-ffqa-v2-inficoder</code> (our InfiCoder model format).</p>
-              <p>Feel free to contribute by adding your model format, which is easy - just slightly modify <code>bigcode_eval/tasks/code_ffqa_v200.py</code> a bit.</p> 
-              <li>Run batch inference to generate responses for question prompts.</li>
-              <div class="highlight"><code>accelerate launch [inference repo dir]/main.py --model [your model path / hugging face hub path] --tasks [determined task name above] --batch_size [batch_size] --precision bf16 --n_samples 30 --do_sample True --temperature 0.2 --top_p 0.9 --save_generations --save_references --trust_remote_code --generation_only --max_new_tokens 1024 --save_generations_path [output raw response file path].json --eos='[EOS string]'</code></div>
-              <p>This command will output two files in your working directory: <code>[output raw response file path].json</code> which stores responses and <code>references.json</code> which stores case names as the index.</p>
-              <li>Export responses and case names to evaluation-capatible csv file.</li>
-              <div class="highlight"><code>python3 [inference repo dir]/ffqa_processor.py [output raw response file path].json references.json [response csv file].csv</code></div>
-              <p>This command will join the two output files above into one csv file <code>[response csv file].csv</code> which can be processed by the evaluation framework below.</p>
+              <li>You can easily use the following command to start a demo using APIs.</li>
+		    <pre><code># initialize poetry
+poetry init
+# Supported LLM: OPEN_AI, AZURE_OPEN_AI
+# api_key is required for API-based models
+bash run_demo.sh --llm AZURE_OPEN_AI --api_key 123</code></pre>
+              <li>After running the above code, an interactive frontend interface will be displayed.</li> 
+		    <img src="static/images/demo_fig.png">
+              <li>You can enter prompts in the dialogue box, and if you need to upload a file, you can select the file for upload in the "browse files" section.</li>
+              <li>Click the "run code interpreter" button, and the backend will execute our pipeline. The agent will generate code and execute it in the sandbox, and the results will be returned on the interactive page upon completion.</li>
             </ol>
             <br>
-            <h3>Step 2: Evaluation (Dev Set)</h3>
+            <h3>Example 2: Running With Local Models</h3>
             <ol>
-              <li>Setup the evaluation framework: <a href="https://github.com/infi-coder/inficoder-eval">Evaluation Repo</a>.</li>
+              <li>Our local LLM service is built on vLLM. First, start a vLLM model server by running this command:</li>
+		    <pre><code># Take llama-2-7b as an example
+python3 src/activities/vllm_api_server.py --model "meta-llama/Llama-2-7b-hf"  --served_model_name "meta-llama/Llama-2-7b-hf"</code></pre>
               <p>At this point, we only support Linux environment.</p>
-              <p>Run <code>pip3 install -r requirements.txt</code>, then <code>./setup.sh</code> (time costly, usually 1-2 hours) which installs necessary compilers and packages for multi-lingual execution environment.</p>
-              <li>Check the evaluation environment.</li>
-              <p>Run <code>python3 env_check.py</code> to check and fix the environment incompatibility according to the console output. If the console output is "You're good to go.", then we can proceed.</p>
-              <li>Unpack the csv output.</li>
-              <p>Unpack the csv output file from the previous inference step into a directory where each response is stored in a separate txt.</p>
-              <div class="highlight"><code>python3 adaptors/csv_response_unpacker.py [response csv file].csv [response save dir]</code></div>
-              <p>We recommend to save the responses in a directory in <code>responses/</code>, i.e., let <code>[response save dir]=responses/...</code>. The above script will create the <code>[response save dir]</code> directory if it does not exist.</p>
-              <li>Run main evaluation.</li>
-              <div class="highlight"><code>python3 grader_main.py suite_v2.0.0_dev.yaml [response save dir]</code></div>
-              <p>The evaluation takes around 15 min - 45 min.</p>
-              <p>When it finishes, there are two output files: <code>results/suite_v2.0.0_dev_[response save dir base name].txt</code> (short summary) and <code>results/suite_v2.0.0_dev_[response save dir base name].yaml</code> (all details).</p>
-              <p>You can also customized the output paths by <code>--result_summary_path</code> and <code>--result_detail_path</code> arguments respectively.</p>
-              <li>Get statistics and print the results.</li>
-              <div class="highlight"><code>python3 print_result_stat.py [result detail path] [summary txt path]</code></div>
-              <p>In console output and <code>[summary txt path]</code>, a nice table will be printed, including the overall score and percentage and the sub-scores for each question type, metric type, and programming language.</p>
+              <li>To check that the server has started successfully, try this command:</li>
+	      <pre><code>curl http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "meta-llama/Llama-2-7b-hf",
+        "prompt": "San Francisco is a",
+        "max_tokens": 7,
+        "temperature": 0
+    }'</code></pre>
+              <li>Then you can run the demo with this command:</li>
+	      <pre><code>bash run_demo.sh --llm "meta-llama/Llama-2-7b-hf"</code></pre>
             </ol>
-            <h3>Step 3: Evaluate (Test Set)</h3>
-            <p>
-              Available upon request (<a href='mailto:linyi.li@bytedance.com'>email us</a>).
-            </p>
+            <br>
+            <h3>Example 3: Running Without Front-end</h3>
+	<ol>
+		<p>Our demo uses the front-end by default. If you prefer to work from the command line or need to process large amounts of data, you can use the following commands:</p>
+		<pre><code># Run with API.
+python3 ./src/activities/eval.py --llm AZURE_OPEN_AI --api_key 123
+# Run with a local model. Take llama-2-7b as an example.
+python3 ./src/activities/eval.py --llm "meta-llama/Llama-2-7b-hf"</code></pre>
+	</ol>
           </div>
         </div>
       </div>
@@ -483,12 +504,13 @@ layout: default
     <div class="container is-max-desktop content">
       <div class="bibtex-body">
         <h2 class="title">BibTeX</h2>
-        <pre><code>@misc{li2023inficodereval,
-  author = {InfiCoderTeam},
-  title = {InfiCoder-Eval: Systematically Evaluating Question-Answering for Code Large Language Models},
-  year = {2023},
-  publisher = {Github Pages},
-  howpublished = "\url{https://github.com/infi-coder/inficoder-eval}"
+        <pre><code>@misc{hu2024infiagentdabench,
+      title={InfiAgent-DABench: Evaluating Agents on Data Analysis Tasks}, 
+      author={Xueyu Hu and Ziyu Zhao and Shuang Wei and Ziwei Chai and Guoyin Wang and Xuwu Wang and Jing Su and Jingjing Xu and Ming Zhu and Yao Cheng and Jianbo Yuan and Kun Kuang and Yang Yang and Hongxia Yang and Fei Wu},
+      year={2024},
+      eprint={2401.05507},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
 }</code></pre>
       </div>
     </div>
@@ -498,12 +520,12 @@ layout: default
   <footer class="footer">
     <div class="container">
       <div class="content has-text-centered">
-        <!-- <a class="icon-link" href="https://arxiv.org/pdf/2211.11501">
+        <!-- <a class="icon-link" href="./static/report/inficoder_eval_report_draft.pdf">
           <i class="fas fa-file-pdf" style="color:white"></i>
-        </a> -->
+        </a>
         <a class="icon-link" href="https://github.com/infi-coder" class="external-link" disabled>
           <i class="fab fa-github" style="color:white"></i>
-        </a>
+        </a> -->
       </div>
       <div class="columns is-centered">
         <div class="column is-8">
@@ -514,7 +536,7 @@ layout: default
                 Commons Attribution-ShareAlike 4.0 International License</a>.
             </p>
             <p>
-              This means you are free to borrow the <a href="https://github.com/infi-coder/inficoder-eval.github.io">source
+              This means you are free to borrow the <a href="https://github.com/infi-coder/inficoder-eval">source
                 code</a> of this website,
               we just ask that you link back to this page in the footer.
             </p>
@@ -526,7 +548,7 @@ layout: default
   
   <script>
     $(document).ready( function () {
-      $('.mainTable').DataTable({ordering: true, order: [[3, 'desc']], columns: [{ "type": "num" },{ "type": "html" },{ "type": "num" },{ "type": "num-fmt" },{ "type": "num-fmt" },{ "type": "num-fmt" },{ "type": "num-fmt" },{ "type": "num-fmt" },{ "type": "num-fmt" },{ "type": "html", "orderable": false }]});
+      $('.mainTable').DataTable({ordering: true, order: [[3, 'desc']], columns: [{ "type": "num" },{ "type": "html" },{ "type": "num" },{ "type": "num-fmt" },{ "type": "num-fmt" },{ "type": "num-fmt" }]});
     } );
   </script>
 
diff --git a/static/images/all_results.png b/static/images/all_results.png
deleted file mode 100644
index d000c7c..0000000
Binary files a/static/images/all_results.png and /dev/null differ
diff --git a/static/images/case-study-eval-data.png b/static/images/case-study-eval-data.png
new file mode 100644
index 0000000..ff320b7
Binary files /dev/null and b/static/images/case-study-eval-data.png differ
diff --git a/static/images/comparison.png b/static/images/comparison.png
deleted file mode 100644
index 56b4a28..0000000
Binary files a/static/images/comparison.png and /dev/null differ
diff --git a/static/images/concepts.png b/static/images/concepts.png
new file mode 100644
index 0000000..95213f7
Binary files /dev/null and b/static/images/concepts.png differ
diff --git a/static/images/data_domain_stats.png b/static/images/data_domain_stats.png
deleted file mode 100644
index 5a8b743..0000000
Binary files a/static/images/data_domain_stats.png and /dev/null differ
diff --git a/static/images/data_examples.png b/static/images/data_examples.png
deleted file mode 100644
index 47e236b..0000000
Binary files a/static/images/data_examples.png and /dev/null differ
diff --git a/static/images/dataset_construction_eval.png b/static/images/dataset_construction_eval.png
new file mode 100644
index 0000000..ba0596f
Binary files /dev/null and b/static/images/dataset_construction_eval.png differ
diff --git a/static/images/demo_fig.png b/static/images/demo_fig.png
new file mode 100644
index 0000000..8d9f082
Binary files /dev/null and b/static/images/demo_fig.png differ
diff --git a/static/images/domains.png b/static/images/domains.png
new file mode 100644
index 0000000..2058b05
Binary files /dev/null and b/static/images/domains.png differ
diff --git a/static/images/framework.png b/static/images/framework.png
new file mode 100644
index 0000000..2313ecb
Binary files /dev/null and b/static/images/framework.png differ
diff --git a/static/images/general_statistics.png b/static/images/general_statistics.png
deleted file mode 100644
index 7611020..0000000
Binary files a/static/images/general_statistics.png and /dev/null differ
diff --git a/static/images/infiagent_logo.png b/static/images/infiagent_logo.png
new file mode 100644
index 0000000..f534ef1
Binary files /dev/null and b/static/images/infiagent_logo.png differ
diff --git a/static/images/inficoder-eval-main.png b/static/images/inficoder-eval-main.png
deleted file mode 100644
index 5fed41b..0000000
Binary files a/static/images/inficoder-eval-main.png and /dev/null differ
diff --git a/static/images/inficoder_eval_logo.png b/static/images/inficoder_eval_logo.png
deleted file mode 100644
index 509db99..0000000
Binary files a/static/images/inficoder_eval_logo.png and /dev/null differ
diff --git a/static/images/inficoder_eval_logo2.jpeg b/static/images/inficoder_eval_logo2.jpeg
deleted file mode 100644
index 9c3d292..0000000
Binary files a/static/images/inficoder_eval_logo2.jpeg and /dev/null differ
diff --git a/static/images/inficoder_eval_logo2.png b/static/images/inficoder_eval_logo2.png
deleted file mode 100644
index 07a81cd..0000000
Binary files a/static/images/inficoder_eval_logo2.png and /dev/null differ
diff --git a/static/images/main_results.png b/static/images/main_results.png
new file mode 100644
index 0000000..29dc874
Binary files /dev/null and b/static/images/main_results.png differ
diff --git a/static/images/question_examples.png b/static/images/question_examples.png
new file mode 100644
index 0000000..901c032
Binary files /dev/null and b/static/images/question_examples.png differ
diff --git a/static/images/spider.png b/static/images/spider.png
new file mode 100644
index 0000000..827aa16
Binary files /dev/null and b/static/images/spider.png differ
diff --git a/static/report/inficoder_eval_report_draft.pdf b/static/report/inficoder_eval_report_draft.pdf
index 0fb600c..2344bee 100644
Binary files a/static/report/inficoder_eval_report_draft.pdf and b/static/report/inficoder_eval_report_draft.pdf differ