3dtown.github.io/index.html at main · UCSB-AI/3dtown.github.io · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
<!DOCTYPE html>
<!-- lang declares the page language so screen readers, translation tools and search engines treat the text as English. -->
<html lang="en">
<head>
  <!-- Document metadata. -->
  <meta charset="utf-8">
  <meta name="description" content="SceneFuse-3D: Constructing a 3D Scene from a Single Image">
  <meta name="keywords" content="3D Scene Generation, 3D Town, Image to 3D">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>SceneFuse-3D: Constructing a 3D Scene from a Single Image</title>

  <!-- Google Analytics (gtag.js): the loader script is async; the inline snippet pushes each call onto dataLayer. -->
  <script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
  <script>
    window.dataLayer = window.dataLayer || [];
    // Pushes the raw arguments object (not a copied array) onto dataLayer.
    function gtag() { dataLayer.push(arguments); }
    gtag('js', new Date());
    gtag('config', 'G-PYVRSFMDRL');
  </script>

  <!-- Web fonts. -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">

  <!-- Stylesheets: Bulma and its extensions, icon fonts, then the site stylesheet last. -->
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="icon" href="./static/images/icon.png">
  <link rel="stylesheet" href="./static/css/index.css">

  <!-- Scripts: jQuery is loaded first; the Font Awesome script is deferred; the rest load in document order. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
</head>
<body>


<!-- Title block: paper title, authors, affiliations and publication links. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">Constructing a 3D Scene from a Single Image</h1>
          <!-- Authors; the superscripts refer to the affiliation list below. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a href="https://kzzheng.github.io/">Kaizhi Zheng</a><sup>1</sup>,</span>
            <span class="author-block">
              <a href="https://www.linkedin.com/in/ruijian-zha/">Ruijian Zha</a><sup>3,4</sup>,</span>
            <span class="author-block">
              <!-- No homepage URL is available: an anchor without href is an inert placeholder, whereas href="" links back to this page. -->
              <a>Zishuo Xu</a><sup>1</sup>,</span>
            <span class="author-block">
              <a href="https://g-jing.github.io">Jing Gu</a><sup>1</sup>,</span>
            <span class="author-block">
              <a href="https://www.linkedin.com/in/cnjieyang">Jie Yang</a><sup>4</sup>,</span>
            <span class="author-block">
              <a href="https://eric-xw.github.io/">Xin Eric Wang</a><sup>1,2</sup></span>
          </div>
          <!-- Affiliations. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block"><sup>1</sup>University of California, Santa Cruz,</span>
            <span class="author-block"><sup>2</sup>University of California, Santa Barbara,</span>
            <span class="author-block"><sup>3</sup>Columbia University,</span>
            <span class="author-block"><sup>4</sup>Utopai Studios</span>
          </div>

          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
              <span class="link-block">
                <a href="https://arxiv.org/pdf/2505.15765"
                   class="external-link button is-normal is-rounded is-dark">
                  <!-- The icon is decorative; the adjacent text names the link. -->
                  <span class="icon">
                      <i class="fas fa-file-pdf" aria-hidden="true"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span>
              <!-- arXiv Link. -->
              <span class="link-block">
                <a href="https://arxiv.org/abs/2505.15765"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="ai ai-arxiv" aria-hidden="true"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>
              <!-- Code Link. -->
              <!-- <span class="link-block">
                <a href=""
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fab fa-github"></i>
                  </span>
                  <span>Code (Releasing Soon)</span>
                  </a>
              </span> -->
            </div>

          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Teaser figure (Figure 1) with its caption. -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <!-- alt summarises the figure for users who cannot see the image. -->
      <img id="teaser" width="150%" src="./static/images/3D_Town_teaser.png"
           alt="3D scenes generated by SceneFuse-3D from a top-down reference image, shown next to the outputs of other image-to-3D generation models and results in several visual styles">
      <h2 class="subtitle has-text-centered">
        <p style="font-family:Times New Roman">Figure 1. <b>3D Scene Generation from a Single Image.</b> Given a top-down reference image (center), <b>SceneFuse-3D</b> generates coherent and realistic 3D scenes that preserve geometry, texture, and layout compared to other state-of-the-art image-to-3D generation models. Our method also generalizes across diverse styles (right), producing high-quality outputs without any 3D training.</p>
      </h2>
    </div>
  </div>
</section>


<!-- Abstract: one justified paragraph, centred in a four-fifths-width column. -->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p style="font-family:Times New Roman">
            Acquiring detailed 3D scenes typically demands costly equipment, multi-view data, or labor-intensive modeling. Therefore, a lightweight alternative, generating complex 3D scenes from a single top-down image, plays an essential role in real-world applications. While recent 3D generative models have achieved remarkable results at the object level, their extension to full-scene generation often leads to inconsistent geometry, layout hallucinations, and low-quality meshes.
            In this work, we introduce <b>SceneFuse-3D</b>, a training-free framework designed to synthesize realistic and coherent 3D scenes from a single top-down view. Our method is grounded in two principles: region-based generation to improve image-to-3D alignment and resolution, and spatial-aware 3D inpainting to ensure global scene coherence and high-quality geometry generation. Specifically, we decompose the input image into overlapping regions and generate each using a pretrained 3D object generator, followed by a masked rectified flow inpainting process that fills in missing geometry while maintaining structural continuity. This modular design allows us to overcome resolution bottlenecks and preserve spatial structure without requiring 3D supervision or fine-tuning.
            Extensive experiments across diverse scenes show that SceneFuse-3D outperforms state-of-the-art baselines, including Trellis, Hunyuan3D-2, and TripoSG, in terms of geometry quality, spatial coherence, and texture fidelity. Our results demonstrate that high-quality 3D scene generation is achievable from a single image using a principled, training-free approach.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>


<!-- Method overview: section heading, contribution list and pipeline figure (Figure 2). -->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <!-- NOTE(review): is-five-fifths and is-six-fifths are not among Bulma's documented column sizes; confirm they are defined in index.css or drop them. -->
      <div class="column is-five-fifths">
        <!-- The heading icon is decorative, hence the empty alt. -->
        <h2 class="title is-3"><img id="painting_icon" width="5%" src="https://cdn-icons-png.flaticon.com/512/5379/5379860.png" alt=""> Spatial-aware Region-based 3D Scene Generation </h2>
      </div>
    </div>

    <div class="columns is-centered has-text-centered">
      <div class="column is-six-fifths">
        <div class="content has-text-justified">
          <ul>
            <li>We propose SceneFuse-3D, a training-free framework for generating structured 3D scenes from a single top-down image, leveraging pretrained object-centric generators for zero-shot scene asset synthesis. </li>
            <li>We develop a modular generation strategy that combines region-wise latent synthesis with spatial-aware 3D inpainting, effectively addressing resolution bottlenecks, image-geometry misalignment, and inter-region inconsistency. </li>
            <li>We conduct comprehensive evaluations on diverse scenes and show that SceneFuse-3D outperforms state-of-the-art baselines (Trellis, Hunyuan3D-2, and TripoSG) in geometry quality, layout coherence, and texture realism under both human and GPT-4o-based assessments.</li>
          </ul>
        </div>
        <img id="model" width="100%" src="./static/images/3D_Town_method.png"
             alt="Overview of the SceneFuse-3D pipeline: spatial prior initialization, region-based generation and fusion, and spatial-aware 3D completion">
        <h3 class="subtitle has-text-centered">
          <p style="font-family:Times New Roman">Figure 2. Given a single top-down image, we first estimate a coarse scene structure via monocular depth and landmark extraction to initialize the scene latent (Spatial Prior Initialization). The scene is divided into overlapping regions for localized synthesis and progressively fused into a coherent global latent (Region-based Generation &amp; Fusion). Each region is completed using a two-stage masked rectified flow pipeline with a sparse structure generator and a structured latent generator (Spatial-aware 3D Completion). The final 3D scene is decoded from the completed structured latent.</p>
        </h3>
      </div>
    </div>
  <!-- Explicitly closes the container div, which was previously left open until the section end tag. -->
  </div>
</section>

<!-- Qualitative comparison: section heading, short description and demo video. -->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <!-- NOTE(review): is-five-fifths and is-six-fifths are not among Bulma's documented column sizes; confirm they are defined in index.css or drop them. -->
      <div class="column is-five-fifths">
        <!-- The id is unique to this section (the method section's heading icon already uses painting_icon); the icon is decorative, hence the empty alt. -->
        <h2 class="title is-3"><img id="comparison_icon" width="5%" src="https://cdn-icons-png.flaticon.com/512/5379/5379860.png" alt=""> Qualitative Comparison</h2>
      </div>
    </div>

    <div class="columns is-centered has-text-centered">
      <div class="column is-six-fifths">
        <div class="content has-text-justified">
          <p>
            Qualitative examples from <b>SceneFuse-3D</b> and baselines on 3D scene asset generation from single images. The comparisons show that <b>SceneFuse-3D</b> provides more coherent and fine-grained 3D scenes than other baselines across various scene styles.
          </p>
        </div>
        <!-- <img id="model" width="100%" src="./static/images/3D_Town_comparision.png"> -->
        <video id="demo-video" width="100%" controls>
          <source src="./static/images/3DTown_demo.mp4" type="video/mp4">
          Your browser does not support the video tag.
        </video>
        <h3 class="subtitle has-text-centered">
          <p style="font-family:Times New Roman">Demo video for comparisons with other baselines on 3D scene generation.</p>
        </h3>
      </div>
    </div>
  <!-- Explicitly closes the container div, which was previously left open until the section end tag. -->
  </div>
</section>

<section class="section" id="BibTeX">
  <!-- Citation block. The entry sits inside a pre element, so its whitespace and line breaks are rendered verbatim; do not re-indent it. -->
  <!-- NOTE(review): the author field omits Zishuo Xu, who is listed in the page's author block, and the title reads "3D Town" while the page heading reads "3D Scene"; confirm both against the arXiv record for 2505.15765. -->
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>@misc{zheng2025constructing3dtownsingle,
      title={Constructing a 3D Town from a Single Image},
      author={Kaizhi Zheng and Ruijian Zha and Jing Gu and Jie Yang and Xin Eric Wang},
      year={2025},
      eprint={2505.15765},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2505.15765},
    }
    </code></pre>
  </div>
</section>


<!-- Page footer: template credits and the license of this site. -->
<footer class="footer">
  <div class="container">
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <!-- Only the Creative Commons link carries rel="license": that link type marks the license covering this page, not the templates being credited. -->
          <p>
            This website is adapted from <a href="https://github.com/nerfies/nerfies.github.io">Nerfies</a> and <a href="https://gligen.github.io/">GLIGEN</a>, licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
            This website is licensed under a <a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/">Creative
            Commons Attribution-ShareAlike 4.0 International License</a>.
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>

</body>
</html>