hushell/ref.bib

## ref.bib
% StyleGAN is brace-protected so sentence-casing styles keep its capitals;
% the venue is capitalised like the other CVPR entries in this file.
@inproceedings{karras2020analyzing,
  author    = {Karras, Tero and Laine, Samuli and Aittala, Miika and Hellsten, Janne and Lehtinen, Jaakko and Aila, Timo},
  title     = {Analyzing and improving the image quality of {StyleGAN}},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {8110--8119},
  year      = {2020},
}

% The author list is complete as written, so the trailing and-others token
% (an export artefact) is dropped.
@article{radford2019language,
  author  = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
  title   = {Language models are unsupervised multitask learners},
  journal = {OpenAI blog},
  volume  = {1},
  number  = {8},
  pages   = {9},
  year    = {2019},
}

% ICML 2021 conference paper; the author list is cut short with the and-others token.
@inproceedings{radford2021learning,
  author       = {Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and others},
  title        = {Learning transferable visual models from natural language supervision},
  booktitle    = {International Conference on Machine Learning},
  year         = {2021},
  pages        = {8748--8763},
  organization = {PMLR},
}

% ICML 2021 conference paper on zero-shot text-to-image generation.
@inproceedings{ramesh2021zero,
  author       = {Ramesh, Aditya and Pavlov, Mikhail and Goh, Gabriel and Gray, Scott and Voss, Chelsea and Radford, Alec and Chen, Mark and Sutskever, Ilya},
  title        = {Zero-shot text-to-image generation},
  booktitle    = {International Conference on Machine Learning},
  year         = {2021},
  pages        = {8821--8831},
  organization = {PMLR},
}

% The acronyms CLIP-GEN and CLIP are brace-protected so sentence-casing
% styles do not lowercase them.
@article{wang2022clip,
  author  = {Wang, Zihao and Liu, Wei and He, Qian and Wu, Xinglong and Yi, Zili},
  title   = {{CLIP-GEN}: Language-Free Training of a Text-to-Image Generator with {CLIP}},
  journal = {arXiv preprint arXiv:2203.00386},
  year    = {2022},
}

% CLIP is an acronym: capitals restored and brace-protected.
@article{ramesh2022hierarchical,
  author  = {Ramesh, Aditya and Dhariwal, Prafulla and Nichol, Alex and Chu, Casey and Chen, Mark},
  title   = {Hierarchical text-conditional image generation with {CLIP} latents},
  journal = {arXiv preprint arXiv:2204.06125},
  year    = {2022},
}

% The last author's given name carries its umlaut, matching the spelling
% used in rombach2021high.
@inproceedings{esser2021taming,
  author    = {Esser, Patrick and Rombach, Robin and Ommer, Bj{\"o}rn},
  title     = {Taming transformers for high-resolution image synthesis},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {12873--12883},
  year      = {2021},
}

% GANs is an acronym: capitals restored and brace-protected.
@article{dhariwal2021diffusion,
  author  = {Dhariwal, Prafulla and Nichol, Alexander},
  title   = {Diffusion models beat {GANs} on image synthesis},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {34},
  year    = {2021},
}

% Volume 34 of Advances in Neural Information Processing Systems; the accented
% author name is written with brace-wrapped accent escapes.
@article{karras2021alias,
  author  = {Karras, Tero and Aittala, Miika and Laine, Samuli and H{\"a}rk{\"o}nen, Erik and Hellsten, Janne and Lehtinen, Jaakko and Aila, Timo},
  title   = {Alias-free generative adversarial networks},
  journal = {Advances in Neural Information Processing Systems},
  year    = {2021},
  volume  = {34},
}

% GLIDE is the model name: capitals restored and brace-protected.
@article{nichol2021glide,
  author  = {Nichol, Alex and Dhariwal, Prafulla and Ramesh, Aditya and Shyam, Pranav and Mishkin, Pamela and McGrew, Bob and Sutskever, Ilya and Chen, Mark},
  title   = {{GLIDE}: Towards photorealistic image generation and editing with text-guided diffusion models},
  journal = {arXiv preprint arXiv:2112.10741},
  year    = {2021},
}

% GAN is brace-protected so sentence-casing styles do not lowercase it.
@article{brock2018large,
  author  = {Brock, Andrew and Donahue, Jeff and Simonyan, Karen},
  title   = {Large scale {GAN} training for high fidelity natural image synthesis},
  journal = {arXiv preprint arXiv:1809.11096},
  year    = {2018},
}

% TediGAN is the method name: capitals restored and brace-protected.
@inproceedings{xia2021tedigan,
  author    = {Xia, Weihao and Yang, Yujiu and Xue, Jing-Hao and Wu, Baoyuan},
  title     = {{TediGAN}: Text-guided diverse face image generation and manipulation},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {2256--2265},
  year      = {2021},
}

% DF-GAN is the method name: capitals restored and brace-protected.
@article{tao2020df,
  author  = {Tao, Ming and Tang, Hao and Wu, Songsong and Sebe, Nicu and Jing, Xiao-Yuan and Wu, Fei and Bao, Bingkun},
  title   = {{DF-GAN}: Deep fusion generative adversarial networks for text-to-image synthesis},
  journal = {arXiv preprint arXiv:2008.05865},
  year    = {2020},
}

% AnyFace is brace-protected so sentence-casing styles keep its inner capital.
@article{sun2022anyface,
  author  = {Sun, Jianxin and Deng, Qiyao and Li, Qi and Sun, Muyi and Ren, Min and Sun, Zhenan},
  title   = {{AnyFace}: Free-style Text-to-Face Synthesis and Manipulation},
  journal = {arXiv preprint arXiv:2203.15334},
  year    = {2022},
}

% StyleCLIP and StyleGAN are model names: capitals restored and brace-protected.
@inproceedings{patashnik2021styleclip,
  author    = {Patashnik, Or and Wu, Zongze and Shechtman, Eli and Cohen-Or, Daniel and Lischinski, Dani},
  title     = {{StyleCLIP}: Text-driven manipulation of {StyleGAN} imagery},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages     = {2085--2094},
  year      = {2021},
}

% Bayes is a proper noun: capital restored and brace-protected.
@article{kingma2013auto,
  author  = {Kingma, Diederik P and Welling, Max},
  title   = {Auto-encoding variational {Bayes}},
  journal = {arXiv preprint arXiv:1312.6114},
  year    = {2013},
}

% Venue name capitalised to match the other entries for this venue, so the
% same series is not printed with two spellings.
@article{goodfellow2014generative,
  author  = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
  title   = {Generative adversarial nets},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {27},
  year    = {2014},
}

% StyleGAN is brace-protected so sentence-casing styles keep its capitals.
@article{tov2021designing,
  author  = {Tov, Omer and Alaluf, Yuval and Nitzan, Yotam and Patashnik, Or and Cohen-Or, Daniel},
  title   = {Designing an Encoder for {StyleGAN} Image Manipulation},
  journal = {arXiv preprint arXiv:2102.02766},
  year    = {2021},
}

% Venue capitalised like the other CVPR entries in this file.
@inproceedings{karras2019style,
  author    = {Karras, Tero and Laine, Samuli and Aila, Timo},
  title     = {A style-based generator architecture for generative adversarial networks},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {4401--4410},
  year      = {2019},
}

% CVPR 2021 conference paper on conditionally-independent pixel synthesis.
@inproceedings{anokhin2021image,
  author    = {Anokhin, Ivan and Demochkin, Kirill and Khakhulin, Taras and Sterkin, Gleb and Lempitsky, Victor and Korzhenkov, Denis},
  title     = {Image generators with conditionally-independent pixel synthesis},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  year      = {2021},
  pages     = {14278--14287},
}

% StyleGAN-XL and StyleGAN are model names: capitals restored and brace-protected.
@article{sauer2022stylegan,
  author  = {Sauer, Axel and Schwarz, Katja and Geiger, Andreas},
  title   = {{StyleGAN-XL}: Scaling {StyleGAN} to large diverse datasets},
  journal = {arXiv preprint arXiv:2202.00273},
  year    = {2022},
}

% The surname particle is lowercase so BibTeX parses it as the von part of
% the name; the third author is spelled out in place of the and-others token
% (NOTE(review): confirm against the published author list).
@article{van2017neural,
  author  = {van den Oord, Aaron and Vinyals, Oriol and Kavukcuoglu, Koray},
  title   = {Neural discrete representation learning},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {30},
  year    = {2017},
}

% First author's given name spelled as in kingma2013auto so both entries sort
% and abbreviate as one person; venue capitalised like the other entries.
@article{kingma2018glow,
  author  = {Kingma, Diederik P and Dhariwal, Prafulla},
  title   = {Glow: Generative flow with invertible 1x1 convolutions},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {31},
  year    = {2018},
}

% ImageNet is the dataset name (capitals restored, brace-protected); the
% organization acronym and venue capitalisation are corrected.
@inproceedings{deng2009imagenet,
  author       = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
  title        = {{ImageNet}: A large-scale hierarchical image database},
  booktitle    = {2009 IEEE Conference on Computer Vision and Pattern Recognition},
  pages        = {248--255},
  year         = {2009},
  organization = {IEEE},
}

% MaskGIT is brace-protected so sentence-casing styles keep its capitals.
@article{chang2022maskgit,
  author  = {Chang, Huiwen and Zhang, Han and Jiang, Lu and Liu, Ce and Freeman, William T},
  title   = {{MaskGIT}: Masked Generative Image Transformer},
  journal = {arXiv preprint arXiv:2202.04200},
  year    = {2022},
}

% arXiv preprint 2112.10752 on latent diffusion models.
@article{rombach2021high,
  author  = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj{\"o}rn},
  title   = {High-Resolution Image Synthesis with Latent Diffusion Models},
  year    = {2021},
  journal = {arXiv preprint arXiv:2112.10752},
}

% Venue capitalised like the other ICML entries in this file.
@inproceedings{reed2016generative,
  author       = {Reed, Scott and Akata, Zeynep and Yan, Xinchen and Logeswaran, Lajanugen and Schiele, Bernt and Lee, Honglak},
  title        = {Generative adversarial text to image synthesis},
  booktitle    = {International Conference on Machine Learning},
  pages        = {1060--1069},
  year         = {2016},
  organization = {PMLR},
}

% StackGAN is the method name (capitals restored, brace-protected); venue
% capitalised like the other conference entries in this file.
@inproceedings{zhang2017stackgan,
  author    = {Zhang, Han and Xu, Tao and Li, Hongsheng and Zhang, Shaoting and Wang, Xiaogang and Huang, Xiaolei and Metaxas, Dimitris N},
  title     = {{StackGAN}: Text to photo-realistic image synthesis with stacked generative adversarial networks},
  booktitle = {Proceedings of the IEEE International Conference on Computer Vision},
  pages     = {5907--5915},
  year      = {2017},
}

% Survey article in the Journal of Artificial Intelligence Research, volume 55.
@article{bernardi2016automatic,
  author  = {Bernardi, Raffaella and Cakici, Ruket and Elliott, Desmond and Erdem, Aykut and Erdem, Erkut and Ikizler-Cinbis, Nazli and Keller, Frank and Muscat, Adrian and Plank, Barbara},
  title   = {Automatic description generation from images: A survey of models, datasets, and evaluation measures},
  journal = {Journal of Artificial Intelligence Research},
  year    = {2016},
  volume  = {55},
  pages   = {409--442},
}

% Journal acronym corrected from the garbled export spelling to CSUR.
@article{hossain2019comprehensive,
  author    = {Hossain, MD Zakir and Sohel, Ferdous and Shiratuddin, Mohd Fairuz and Laga, Hamid},
  title     = {A comprehensive survey of deep learning for image captioning},
  journal   = {ACM Computing Surveys (CSUR)},
  volume    = {51},
  number    = {6},
  pages     = {1--36},
  year      = {2019},
  publisher = {ACM New York, NY, USA},
}

% StackGAN++ is the method name (capitals restored, brace-protected); the
% journal name is written with its proper capitalisation.
@article{zhang2018stackgan++,
  author    = {Zhang, Han and Xu, Tao and Li, Hongsheng and Zhang, Shaoting and Wang, Xiaogang and Huang, Xiaolei and Metaxas, Dimitris N},
  title     = {{StackGAN++}: Realistic image synthesis with stacked generative adversarial networks},
  journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume    = {41},
  number    = {8},
  pages     = {1947--1962},
  year      = {2018},
  publisher = {IEEE},
}

% AttnGAN is the method name (capitals restored, brace-protected); venue
% capitalised like the other CVPR entries in this file.
@inproceedings{xu2018attngan,
  author    = {Xu, Tao and Zhang, Pengchuan and Huang, Qiuyuan and Zhang, Han and Gan, Zhe and Huang, Xiaolei and He, Xiaodong},
  title     = {{AttnGAN}: Fine-grained text to image generation with attentional generative adversarial networks},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages     = {1316--1324},
  year      = {2018},
}

% StoryGAN and GAN: capitals restored and brace-protected.
@inproceedings{li2019storygan,
  author    = {Li, Yitong and Gan, Zhe and Shen, Yelong and Liu, Jingjing and Cheng, Yu and Wu, Yuexin and Carin, Lawrence and Carlson, David and Gao, Jianfeng},
  title     = {{StoryGAN}: A sequential conditional {GAN} for story visualization},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {6329--6338},
  year      = {2019},
}

% MirrorGAN is the method name: capitals restored and brace-protected.
@inproceedings{qiao2019mirrorgan,
  author    = {Qiao, Tingting and Zhang, Jing and Xu, Duanqing and Tao, Dacheng},
  title     = {{MirrorGAN}: Learning text-to-image generation by redescription},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {1505--1514},
  year      = {2019},
}

% Journal name written with its proper capitalisation; this record carries
% no volume or pages.
@article{hinz2020semantic,
  author    = {Hinz, Tobias and Heinrich, Stefan and Wermter, Stefan},
  title     = {Semantic object accuracy for generative text-to-image synthesis},
  journal   = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year      = {2020},
  publisher = {IEEE},
}

% Repeated entry removed: the key tao2020df is already defined earlier in this file, and BibTeX reports a repeated key as an error.

% Repeated entry removed: the key ramesh2021zero is already defined earlier in this file, and BibTeX reports a repeated key as an error.

% VQGAN-CLIP is the method name: capitals restored and brace-protected.
@article{crowson2022vqgan,
  author  = {Crowson, Katherine and Biderman, Stella and Kornis, Daniel and Stander, Dashiell and Hallahan, Eric and Castricato, Louis and Raff, Edward},
  title   = {{VQGAN-CLIP}: Open domain image generation and editing with natural language guidance},
  journal = {arXiv preprint arXiv:2204.08583},
  year    = {2022},
}

% arXiv preprint 2205.11487; the author list is cut short with the and-others token.
@article{saharia2022photorealistic,
  author  = {Saharia, Chitwan and Chan, William and Saxena, Saurabh and Li, Lala and Whang, Jay and Denton, Emily and Ghasemipour, Seyed Kamyar Seyed and Ayan, Burcu Karagol and Mahdavi, S Sara and Lopes, Rapha Gontijo and others},
  title   = {Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding},
  year    = {2022},
  journal = {arXiv preprint arXiv:2205.11487},
}

% Repeated entry removed: the key ramesh2022hierarchical is already defined earlier in this file, and BibTeX reports a repeated key as an error.

% arXiv preprint 2206.10789; the author list is cut short with the and-others token.
@article{yu2022scaling,
  author  = {Yu, Jiahui and Xu, Yuanzhong and Koh, Jing Yu and Luong, Thang and Baid, Gunjan and Wang, Zirui and Vasudevan, Vijay and Ku, Alexander and Yang, Yinfei and Ayan, Burcu Karagol and others},
  title   = {Scaling autoregressive models for content-rich text-to-image generation},
  year    = {2022},
  journal = {arXiv preprint arXiv:2206.10789},
}

% CogView is the model name: inner capital restored and brace-protected.
@article{ding2021cogview,
  author  = {Ding, Ming and Yang, Zhuoyi and Hong, Wenyi and Zheng, Wendi and Zhou, Chang and Yin, Da and Lin, Junyang and Zou, Xu and Shao, Zhou and Yang, Hongxia and others},
  title   = {{CogView}: Mastering text-to-image generation via transformers},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {34},
  pages   = {19822--19835},
  year    = {2021},
}

% CogView2 is brace-protected so sentence-casing styles keep its inner capital.
@article{ding2022cogview2,
  author  = {Ding, Ming and Zheng, Wendi and Hong, Wenyi and Tang, Jie},
  title   = {{CogView2}: Faster and Better Text-to-Image Generation via Hierarchical Transformers},
  journal = {arXiv preprint arXiv:2204.14217},
  year    = {2022},
}

% Volume 33 of Advances in Neural Information Processing Systems.
@article{ho2020denoising,
  author  = {Ho, Jonathan and Jain, Ajay and Abbeel, Pieter},
  title   = {Denoising diffusion probabilistic models},
  journal = {Advances in Neural Information Processing Systems},
  year    = {2020},
  volume  = {33},
  pages   = {6840--6851},
}


% Second key for the same paper as karras2020analyzing; the key is kept so
% existing citations still resolve. Venue is spelled out and pages are added
% to agree with that entry; names use the Last, First form used elsewhere.
@inproceedings{Karras2019stylegan2,
  author    = {Karras, Tero and Laine, Samuli and Aittala, Miika and Hellsten, Janne and Lehtinen, Jaakko and Aila, Timo},
  title     = {Analyzing and Improving the Image Quality of {StyleGAN}},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {8110--8119},
  year      = {2020},
}
	% Repeated entry removed: the key karras2020analyzing is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key radford2019language is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key radford2021learning is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key ramesh2021zero is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key wang2022clip is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key ramesh2022hierarchical is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key esser2021taming is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key dhariwal2021diffusion is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key karras2021alias is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key nichol2021glide is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key brock2018large is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key xia2021tedigan is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key tao2020df is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key sun2022anyface is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key patashnik2021styleclip is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key kingma2013auto is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key goodfellow2014generative is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key tov2021designing is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key karras2019style is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key anokhin2021image is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key sauer2022stylegan is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key van2017neural is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key kingma2018glow is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key deng2009imagenet is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key chang2022maskgit is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key rombach2021high is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key reed2016generative is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key zhang2017stackgan is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key bernardi2016automatic is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key hossain2019comprehensive is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key zhang2018stackgan++ is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key xu2018attngan is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key li2019storygan is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key qiao2019mirrorgan is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key hinz2020semantic is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key tao2020df is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key ramesh2021zero is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key crowson2022vqgan is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key saharia2022photorealistic is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key ramesh2022hierarchical is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key yu2022scaling is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key ding2021cogview is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key ding2022cogview2 is already defined earlier in this file, and BibTeX reports a repeated key as an error.

	% Repeated entry removed: the key ho2020denoising is already defined earlier in this file, and BibTeX reports a repeated key as an error.


	% Repeated entry removed: the key Karras2019stylegan2 is already defined earlier in this file, and BibTeX reports a repeated key as an error.