diff --git a/README.md b/README.md index edb639d..1eea004 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# InfiCoder-Eval +# InfiAgent -This is the repository that contains source code for the [InfiCoder-Eval](https://github.com/infi-coder/inficoder-eval). +This is the repository that contains source code for the [InfiAgent](https://infi-coder.github.io/inficoder-eval/) website. ``` diff --git a/_config.yaml b/_config.yaml new file mode 100644 index 0000000..9408c5a --- /dev/null +++ b/_config.yaml @@ -0,0 +1,91 @@ +# Welcome to Jekyll! +# +# This config file is meant for settings that affect your whole blog, values +# which you are expected to set up once and rarely edit after that. If you find +# yourself editing this file very often, consider using Jekyll's data files +# feature for the data you need to update frequently. +# +# For technical reasons, this file is *NOT* reloaded automatically when you use +# 'bundle exec jekyll serve'. If you change this file, please restart the server process. +# +# If you need help with YAML syntax, here are some quick references for you: +# https://learn-the-web.algonquindesign.ca/topics/markdown-yaml-cheat-sheet/#yaml +# https://learnxinyminutes.com/docs/yaml/ +# +# Site settings +# These are used to personalize your new site. If you look in the HTML files, +# you will see them accessed via {{ site.title }}, {{ site.email }}, and so on. +# You can create any custom variable you would like, and they will be accessible +# in the templates via {{ site.myvariable }}. + +title: "InfiAgent: Building and Evaluating Agents on Data Analysis Tasks" +# email: anonymous@github.com +description: >- # this means to ignore newlines until "baseurl:" + This is the website for the InfiAgent benchmark +baseurl: "/inficoder-eval" # the subpath of your site, e.g. /blog +url: "https://infi-coder.github.io" # the base hostname & protocol for your site, e.g. 
http://example.com +# twitter_username: sokcertifiedrobustness +github_username: llylly + +# Build settings +# theme: minima +# plugins: + # - jekyll-feed + + +minima: + date_format: "%b %-d, %Y" + + # generate social links in footer + # social_links: + # twitter: jekyllrb + # github: sokcertifiedrobustness + # devto: jekyll + # dribbble: jekyll + # facebook: jekyll + # flickr: jekyll + # instagram: jekyll + # linkedin: jekyll + # pinterest: jekyll + # youtube: jekyll + # youtube_channel: UC8CXR0-3I70i1tfPg1PAE1g + # youtube_channel_name: CloudCannon + # telegram: jekyll + # googleplus: +jekyll + # microdotblog: jekyll + # keybase: jekyll + + # Mastodon instances + # mastodon: + # - username: jekyll + # instance: example.com + # - username: jekyll2 + # instance: example.com + + # GitLab instances + # gitlab: + # - username: jekyll + # instance: example.com + # - username: jekyll2 + # instance: example.com + +# Exclude from processing. +# The following items will not be processed, by default. +# Any item listed under the `exclude:` key here will be automatically added to +# the internal "default list". +# +# Excluded items can be processed by explicitly listing the directories or +# their entries' file path in the `include:` list. 
+# +# exclude: +# - index.html +# - .sass-cache/ +# - .jekyll-cache/ +# - gemfiles/ +# - Gemfile +# - Gemfile.lock +# - node_modules/ +# - vendor/bundle/ +# - vendor/cache/ +# - vendor/gems/ +# - vendor/ruby/ \ No newline at end of file diff --git a/_data/leaderboard.yml b/_data/leaderboard.yml index 5be2082..dc98cf1 100644 --- a/_data/leaderboard.yml +++ b/_data/leaderboard.yml @@ -1,365 +1,178 @@ records: - comment: null - devscore: 61.28% - devscore_std: 1.45% + aps_score: 74.60% + as_score: 78.72% + a_score: 79.01% link: https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo rank: 1 - score: 58.89% - score_std: 0.74% size: null - testscore: 56.49% - testscore_std: 0.19% - title: GPT-4 (0613) + title: gpt-4-0613 - comment: null - devscore: 50.13% - devscore_std: null - link: https://deepseekcoder.github.io/ + aps_score: 60.77% + as_score: 67.50% + a_score: 62.98% + link: https://arxiv.org/abs/2401.05507 rank: 2 - score: 50.34% - score_std: null - size: 33 - testscore: 50.55% - testscore_std: null - title: deepseek-coder-33b-instruct + size: 34 + title: daagent-34b - comment: null - devscore: 46.16% - devscore_std: 0.72% - link: https://platform.openai.com/docs/models/gpt-3-5 + aps_score: 60.13% + as_score: 65.94% + a_score: 64.27% + link: https://api.minimax.chat/ rank: 3 - score: 46.70% - score_std: 0.70% size: null - testscore: 47.24% - testscore_std: 1.75% - title: GPT-3.5-turbo (0613) + title: abab5.5-chat - comment: null - devscore: 46.52% - devscore_std: null - link: https://huggingface.co/WizardLM/WizardCoder-Python-34B-V1.0 + aps_score: 58.20% + as_score: 65.70% + a_score: 61.88% + link: https://platform.openai.com/docs/models/gpt-3-5 rank: 4 - score: 45.16% - score_std: null - size: 34 - testscore: 43.80% - testscore_std: null - title: WizardCoder-Python-34B-V1.0 + size: null + title: gpt-3.5-turbo-0613 - comment: null - devscore: 44.03% - devscore_std: null - link: https://github.com/facebookresearch/codellama + aps_score: 57.56% + as_score: 
62.46% + a_score: 56.17% + link: https://platform.openai.com/docs/models/gpt-3-5 rank: 5 - score: 43.71% - score_std: null - size: 34 - testscore: 43.39% - testscore_std: null - title: CodeLlama-34B-Instruct + size: 72 + title: qwen-72b-chat - comment: null - devscore: 42.28% - devscore_std: 0.24% - link: https://deepseekcoder.github.io/ + aps_score: 54.52% + as_score: 60.51% + a_score: 55.72% + link: https://arxiv.org/abs/2401.05507 rank: 6 - score: 42.97% - score_std: 0.22% - size: 6.7 - testscore: 43.66% - testscore_std: 0.26% - title: deepseek-coder-6.7b-instruct + size: 13 + title: daagent-13b - comment: null - devscore: 41.46% - devscore_std: 1.13% - link: https://huggingface.co/WizardLM/WizardCoder-Python-13B-V1.0 + aps_score: 53.38% + as_score: 58.32% + a_score: 51.93% + link: https://platform.openai.com/docs/models/gpt-3-5 rank: 7 - score: 41.22% - score_std: 0.75% - size: 13 - testscore: 40.97% - testscore_std: 0.42% - title: WizardCoder-Python-13B-V1.0 + size: null + title: gemini-pro - comment: null - devscore: 43.70% - devscore_std: 2.09% - link: https://huggingface.co/WizardLM/WizardCoder-Python-7B-V1.0 + aps_score: 49.20% + as_score: 54.02% + a_score: 51.38% + link: https://platform.openai.com/docs/models/gpt-3-5 rank: 8 - score: 40.30% - score_std: 1.15% - size: 7 - testscore: 36.90% - testscore_std: 0.22% - title: WizardCoder-Python-7B-V1.0 + size: 46.7(12.9) + title: mixtral-8x7b-instruct-v0.1 - comment: null - devscore: 39.71% - devscore_std: null - link: https://github.com/facebookresearch/codellama + aps_score: 49.02% + as_score: 57.63% + a_score: 54.19% + link: https://arxiv.org/abs/2401.05507 rank: 9 - score: 39.75% - score_std: null - size: 34 - testscore: 39.79% - testscore_std: null - title: CodeLlama-34B + size: 7 + title: daagent-7b - comment: null - devscore: 41.20% - devscore_std: 0.25% - link: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta + aps_score: 44.84% + as_score: 48.90% + a_score: 46.31% + link: 
https://platform.openai.com/docs/models/gpt-3-5 rank: 10 - score: 39.59% - score_std: 0.68% - size: 7 - testscore: 37.97% - testscore_std: 1.29% - title: Zypher-7b-beta + size: 33 + title: deepseek-coder-33b-instruct - comment: null - devscore: 35.45% - devscore_std: 0.56% - link: https://huggingface.co/bigcode/octocoder + aps_score: 43.41% + as_score: 49.65% + a_score: 46.96% + link: https://platform.openai.com/docs/models/gpt-3-5 rank: 11 - score: 37.72% - score_std: 0.58% - size: 15.5 - testscore: 40.00% - testscore_std: 1.69% - title: OctoCoder + size: null + title: claude-2.1 - comment: null - devscore: 36.03% - devscore_std: 1.37% - link: https://github.com/facebookresearch/codellama + aps_score: 42.02% + as_score: 47.41% + a_score: 44.40% + link: https://platform.openai.com/docs/models/gpt-3-5 rank: 12 - score: 37.18% - score_std: 0.51% - size: 13 - testscore: 38.34% - testscore_std: 0.36% - title: CodeLlama-13B-Instruct + size: 34 + title: phind-codellama-34b-v2 - comment: null - devscore: 36.19% - devscore_std: 0.75% - link: https://huggingface.co/Qwen/Qwen-14B-Chat + aps_score: 39.87% + as_score: 45.46% + a_score: 42.73% + link: https://platform.openai.com/docs/models/gpt-3-5 rank: 13 - score: 36.93% - score_std: 0.12% - size: 14 - testscore: 37.68% - testscore_std: 0.91% - title: Qwen-14B-Chat + size: 34 + title: xwincoder-34b - comment: null - devscore: 31.78% - devscore_std: 1.01% - link: https://github.com/facebookresearch/codellama + aps_score: 36.45% + as_score: 41.61% + a_score: 36.90% + link: https://platform.openai.com/docs/models/gpt-3-5 rank: 14 - score: 33.92% - score_std: 0.64% - size: 13 - testscore: 36.06% - testscore_std: 0.49% - title: CodeLlama-13B + size: 7 + title: mistral-7b-instruct-v0.2 - comment: null - devscore: 31.85% - devscore_std: 0.60% - link: https://huggingface.co/WizardLM/WizardCoder-15B-V1.0 + aps_score: 36.45% + as_score: 41.36% + a_score: 34.51% + link: https://platform.openai.com/docs/models/gpt-3-5 rank: 15 - score: 
33.34% - score_std: 0.74% - size: 15 - testscore: 34.83% - testscore_std: 1.86% - title: WizardCoder-15B-V1.0 + size: 14 + title: qwen-14b-chat - comment: null - devscore: 32.10% - devscore_std: null - link: https://github.com/facebookresearch/codellama + aps_score: 27.27% + as_score: 27.27% + a_score: 16.00% + link: https://platform.openai.com/docs/models/gpt-3-5 rank: 16 - score: 33.23% - score_std: null - size: 34 - testscore: 34.35% - testscore_std: null - title: CodeLlama-34B-Python -- comment: null - devscore: 31.55% - devscore_std: 2.01% - link: https://huggingface.co/bigcode/octogeex - rank: 17 - score: 32.60% - score_std: 1.02% - size: 6 - testscore: 33.65% - testscore_std: 0.91% - title: OctoGeeX -- comment: null - devscore: 31.87% - devscore_std: 0.66% - link: https://huggingface.co/Qwen/Qwen-7B-Chat - rank: 18 - score: 32.48% - score_std: 0.71% - size: 7 - testscore: 33.10% - testscore_std: 0.77% - title: Qwen-7B-Chat -- comment: null - devscore: 31.09% - devscore_std: 1.85% - link: https://github.com/facebookresearch/codellama - rank: 19 - score: 32.43% - score_std: 0.42% - size: 13 - testscore: 33.78% - testscore_std: 1.02% - title: CodeLlama-13B-Python -- comment: null - devscore: 30.05% - devscore_std: 1.79% - link: https://github.com/facebookresearch/codellama - rank: 20 - score: 31.07% - score_std: 0.87% size: 7 - testscore: 32.09% - testscore_std: 0.05% - title: CodeLlama-7B -- comment: null - devscore: 31.33% - devscore_std: 0.80% - link: https://huggingface.co/WizardLM/WizardCoder-3B-V1.0 - rank: 21 - score: 30.94% - score_std: 0.60% - size: 3 - testscore: 30.56% - testscore_std: 0.58% - title: WizardCoder-3B-V1.0 + title: qwen-7b-chat - comment: null - devscore: 30.15% - devscore_std: 0.73% - link: https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat - rank: 22 - score: 30.34% - score_std: 0.76% + aps_score: 26.26% + as_score: 30.88% + a_score: 26.31% + link: https://platform.openai.com/docs/models/gpt-3-5 + rank: 17 size: 13 - testscore: 
30.52% - testscore_std: 1.32% - title: Baichuan2-13B-Chat -- comment: null - devscore: 27.96% - devscore_std: 0.81% - link: https://github.com/facebookresearch/codellama - rank: 23 - score: 29.51% - score_std: 0.97% - size: 7 - testscore: 31.06% - testscore_std: 1.18% - title: CodeLlama-7B-Instruct -- comment: null - devscore: 27.51% - devscore_std: 1.09% - link: https://github.com/facebookresearch/codellama - rank: 24 - score: 28.88% - score_std: 0.45% - size: 7 - testscore: 30.24% - testscore_std: 1.97% - title: CodeLlama-7B-Python + title: vicuna-13b-v1.5 - comment: null - devscore: 26.53% - devscore_std: 1.12% - link: https://huggingface.co/WizardLM/WizardCoder-1B-V1.0 - rank: 25 - score: 27.11% - score_std: 0.85% - size: 1 - testscore: 27.70% - testscore_std: 0.72% - title: WizardCoder-1B-V1.0 -- comment: null - devscore: 28.74% - devscore_std: 0.78% - link: https://huggingface.co/bigcode/starcoder - rank: 26 - score: 26.79% - score_std: 0.18% - size: 15.5 - testscore: 24.84% - testscore_std: 0.96% - title: StarCoder -- comment: null - devscore: 24.45% - devscore_std: 1.55% - link: https://huggingface.co/bigcode/starcoderplus - rank: 27 - score: 26.07% - score_std: 1.25% - size: 15.5 - testscore: 27.69% - testscore_std: 2.13% - title: StarCoderPlus + aps_score: 23.79% + as_score: 26.82% + a_score: 25.05% + link: https://platform.openai.com/docs/models/gpt-3-5 + rank: 18 + size: 20 + title: internlm-chat-20b - comment: null - devscore: 23.36% - devscore_std: 1.77% - link: https://huggingface.co/Salesforce/codegen25-7b-instruct - rank: 28 - score: 25.67% - score_std: 1.57% - size: 7 - testscore: 27.98% - testscore_std: 1.46% - title: CodeGen2.5-7B-instruct + aps_score: 23.13% + as_score: 26.45% + a_score: 22.40% + link: https://platform.openai.com/docs/models/gpt-3-5 + rank: 19 + size: 34 + title: wizardcoder-python-34b-v1.0 - comment: null - devscore: 25.44% - devscore_std: 0.02% - link: https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat - rank: 29 - score: 
24.39% - score_std: 0.25% + aps_score: 16.99% + as_score: 20.71% + a_score: 17.89% + link: https://platform.openai.com/docs/models/gpt-3-5 + rank: 20 size: 7 - testscore: 23.34% - testscore_std: 0.50% - title: Baichuan2-7B-Chat -- comment: null - devscore: 17.47% - devscore_std: 1.56% - link: https://platform.openai.com/docs/models/gpt-base - rank: 30 - score: 19.08% - score_std: 1.00% - size: / - testscore: 20.70% - testscore_std: 0.70% - title: davinci-002 + title: agentlm-7b - comment: null - devscore: 14.10% - devscore_std: 0.84% - link: https://huggingface.co/microsoft/phi-1_5 - rank: 31 - score: 16.63% - score_std: 0.03% - size: 1.5 - testscore: 19.16% - testscore_std: 0.87% - title: phi-1.5 -- comment: null - devscore: 16.20% - devscore_std: 0.65% - link: https://github.com/THUDM/CodeGeeX2 - rank: 32 - score: 16.50% - score_std: 0.39% + aps_score: 16.67% + as_score: 20.59% + a_score: 19.27% + link: https://platform.openai.com/docs/models/gpt-3-5 + rank: 21 size: 6 - testscore: 16.80% - testscore_std: 0.33% - title: CodeGeeX2 + title: chatglm3-6b - comment: null - devscore: 11.45% - devscore_std: 0.19% - link: https://huggingface.co/microsoft/phi-1 - rank: 33 - score: 12.84% - score_std: 0.73% - size: 1.3 - testscore: 14.23% - testscore_std: 1.28% - title: phi-1 -settings: main + aps_score: 14.56% + as_score: 17.39% + a_score: 13.94% + link: https://platform.openai.com/docs/models/gpt-3-5 + rank: 22 + size: 34 + title: codellama-34b-instruct +# settings: main diff --git a/_layout/mydefault.html b/_layout/mydefault.html new file mode 100644 index 0000000..f35da65 --- /dev/null +++ b/_layout/mydefault.html @@ -0,0 +1,2 @@ + +{{ content }} \ No newline at end of file diff --git a/_site/README.md b/_site/README.md index 56bbbd7..1eea004 100644 --- a/_site/README.md +++ b/_site/README.md @@ -1,6 +1,6 @@ -# DS-1000 +# InfiAgent -This is the repository that contains source code for the [DS-1000 website](https://ds1000-code-gen.github.io). 
+This is the repository that contains source code for the [InfiAgent](https://infi-coder.github.io/inficoder-eval/) website. ``` diff --git a/_site/github_token.txt b/_site/github_token.txt new file mode 100644 index 0000000..310fced --- /dev/null +++ b/_site/github_token.txt @@ -0,0 +1 @@ +REDACTED_GITHUB_TOKEN \ No newline at end of file diff --git a/_site/index.html b/_site/index.html index 4d4d5c0..25fb55b 100644 --- a/_site/index.html +++ b/_site/index.html @@ -2,12 +2,10 @@ - - + + - InfiCoder-Eval: Systematically Evaluating Question-Answering - for Code Large Language Models + InfiAgent: Building and Evaluating Agents on Data Analysis Tasks @@ -32,12 +30,25 @@ - + + + + + @@ -52,7 +63,7 @@